用 Python 抓了一些数据存到本地

Posted 2020-09-17

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了用 Python 抓了一些数据存到本地相关的知识，希望对你有一定的参考价值。

import codecs

from xml.dom.minidom import Document
import requests
from bs4 import BeautifulSoup

# Module-level XML document: WirteXml creates its elements and text nodes on
# it, and the __main__ script below builds the root element on it and
# serialises it to disk.
doc = Document()
def getAllUrl(pageCount):
    """Return the listing-page URL for page ``pageCount``.

    Args:
        pageCount: Value substituted for the ``{page}`` placeholder of the
            URL template (the script passes an integer page number).

    Returns:
        The formatted URL string.
    """
    # The template must be delimited by plain ASCII quotes; the typographic
    # quotes found in the published listing are a SyntaxError.
    url = 'https://www.xxx.co/xxxx/{page}'
    return url.format(page=pageCount)

def getHtml(pageCount, timeout=30):
    """Download listing page ``pageCount`` and return the HTTP response.

    Args:
        pageCount: Page number, forwarded to ``getAllUrl`` to build the URL.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 30.

    Returns:
        The response object returned by ``requests.get``; callers read its
        ``.text`` attribute.
    """
    # requests applies no timeout unless one is passed, so a single stalled
    # connection would otherwise block the whole crawl indefinitely.
    return requests.get(getAllUrl(pageCount), timeout=timeout)

def WirteXml(gName,gImg,wUrl):
    """Append ``name``, ``imgUrl`` and ``webUrl`` elements for one record.

    Every element is created on the module-level ``doc`` and appended to the
    module-level ``aperson`` element, which must exist before this function
    is called (the ``__main__`` script creates it).

    Args:
        gName: Text placed inside the ``name`` element.
        gImg: Text placed inside the ``imgUrl`` element.
        wUrl: Text placed inside the ``webUrl`` element.
    """
    for tag, text in (("name", gName), ("imgUrl", gImg), ("webUrl", wUrl)):
        element = doc.createElement(tag)
        aperson.appendChild(element)
        # Bug fix: the imgUrl branch used to call ``img.append.Child(...)``,
        # which raised AttributeError on every call, so the image URL text
        # and the whole webUrl element were never written.
        element.appendChild(doc.createTextNode(text))

if __name__ == '__main__':
    # Crawl listing pages 1..1249, record every <img> that carries both a
    # title and a src attribute, and save the collected document as XML.
    filename = "people.xml"
    people = doc.createElement("Actresses")
    doc.appendChild(people)
    # NOTE: WirteXml appends to this one module-level element, so every
    # record ends up as a child of the same <person> node. The output
    # structure is kept as-is so the generated file format does not change.
    aperson = doc.createElement("person")
    people.appendChild(aperson)
    for count in range(1, 1250):
        html = getHtml(count).text
        soup = BeautifulSoup(html, "lxml")
        for img in soup.findAll("img"):
            try:
                girlName = img.attrs["title"]
                girlImage = img.attrs["src"]
            except KeyError:
                # Images without a title or src are not records; skip them.
                continue
            try:
                # Detail-page URL: last path segment of the image URL with
                # its final six characters removed.
                webUrl = "https://www.xxx.co/xx/" + girlImage.split('/')[-1][:-6]
                WirteXml(girlName, girlImage, webUrl)
            except Exception as exc:
                # Stay best-effort for a single bad record, but report it
                # instead of silently swallowing everything (a bare except
                # also trapped Ctrl-C and hid real bugs).
                print("skipped image on page " + str(count) + ": " + repr(exc))
        print("第"+str(count)+"页抓完！！！")
    # Open the output only once the document is complete, and let the context
    # manager close it even if serialisation fails.
    with codecs.open(filename, "w", 'utf-8') as f:
        f.write(doc.toprettyxml(indent="  "))

以上是关于用 Python 抓了一些数据存到本地的主要内容，如果未能解决你的问题，请参考以下文章。