python 爬虫示例,方便日后参考
Posted YuQiao0303
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 爬虫示例,方便日后参考相关的知识,希望对你有一定的参考价值。
def _text_at(nodes, index=0, default='NULL'):
    """Return ``nodes[index]``, or *default* when the XPath result is too short."""
    return nodes[index] if len(nodes) > index else default


def getOneMoviesInfo(Mid, url, out_path=r'C:\Users\yuqiao\Desktop\testSpider.txt'):
    """Scrape one movie page and append its details as one record to a file.

    The page at *url* is downloaded, parsed with lxml, and a single
    pipe-separated line is appended to *out_path* with the fields, in order:
    Mid, name, brief, year, date, director, three writers, five stars,
    picture. Any value that cannot be found on the page is written as the
    string "NULL"; writers and stars are padded with "NULL" up to three and
    five entries, and extra entries beyond those counts are not written.

    Args:
        Mid: Identifier of the movie; written as the first field and printed.
        url: Address of the movie page to download.
        out_path: File the record is appended to (UTF-8). The default is the
            original hard-coded location with its path separators restored.

    Returns:
        None. If the page has no title element, a failure message is printed
        and nothing is written.
    """
    import requests
    from lxml import etree

    # A timeout keeps one unresponsive page from hanging the whole run.
    data = requests.get(url, timeout=30).text  # download the page
    s = etree.HTML(data)  # parse the HTML into an element tree

    # Poster image URL: take the first match instead of writing a list repr.
    picture = _text_at(s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[1]/div[1]/img/@src'))
    # Unused alternative XPath for a second image:
    # '//*[@id="media_v4"]/div[2]/div[1]/div/div/section[3]/div[2]/div/div[1]/img/@src'

    name = s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[1]/span/a/h2/text()')
    if len(name) == 0:
        print("Mid = %s , failed for a lack of TMDB id "%Mid)
        return
    name = name[0]

    # The year is rendered in parentheses on the page; strip them off.
    year = _text_at(
        s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[1]/span/span/text()')
    ).strip("(").strip().strip(")")
    # The release date is the second text node of the first list item.
    date = _text_at(
        s.xpath('//*[@id="media_v4"]/div[2]/div[2]/div/section/div[1]/div/section[1]/ul/li[1]/text()'),
        1,
    ).strip()
    # Escape newlines so that every record stays on a single line of the file.
    brief = _text_at(
        s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[2]/div/p/text()')
    ).replace("\n", "\\n")

    # Main creators: each list item holds a name (p[1]/a) and a profession (p[2]).
    mainCreators = s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[2]/ol/li')
    writers = []
    director = "NULL"
    for item in mainCreators:
        creatorNames = item.xpath('./p[1]/a/text()')
        if len(creatorNames) == 0:
            # Skip an entry without a name; resetting here would discard
            # credits that were already collected.
            continue
        creatorName = creatorNames[0]
        creatorProfession = _text_at(item.xpath('./p[2]/text()'), default='')
        if 'Director' in creatorProfession:
            director = creatorName
        elif 'Screenplay' in creatorProfession or 'Writer' in creatorProfession:
            writers.append(creatorName)

    # Cast: keep the name of every list item that has one.
    stars = []
    starsData = s.xpath('//*[@id="media_v4"]/div[2]/div[1]/div/div/section[1]/ol/li')
    for item in starsData:
        star = item.xpath('./p[1]/a/text()')
        if len(star) != 0:
            stars.append(star[0])

    # Pad to the fixed column counts of the record (3 writers, 5 stars).
    writers.extend(["NULL"] * (3 - len(writers)))
    stars.extend(["NULL"] * (5 - len(stars)))

    fields = [Mid, name, brief, year, date, director]
    fields += writers[:3]
    fields += stars[:5]
    fields.append(picture)
    with open(out_path, 'a', encoding='utf-8') as f:
        f.write("|".join(str(field) for field in fields) + "\n")

    print(Mid)
    print(name)
# ______________________________________________________ main script ______________________________________________________
import time

# Start every run with an empty output file.
with open(r'C:\Users\yuqiao\Desktop\testSpider.txt', 'w', encoding='utf-8') as f:
    f.write("")

# Query-string suffix selecting zh-CN; not used by the code visible here.
language = '?language=zh-CN'

# NOTE(review): the path separators of this Windows path appear to have been
# lost; restore them to match the real location of the URL list before running.
with open(r'D:gitiyeMovieMidURL.txt', "rt", encoding='utf-8') as in_file:
    content = in_file.read()
lines = content.split("\n")

# Only the line at index 9124 is processed; an earlier run used
# range(51, 61), i.e. lines 51 to 60.
for i in range(9124, 9125):
    line = lines[i]
    print(line)
print('finished')
以上是关于python 爬虫示例,方便日后参考的主要内容,如果未能解决你的问题,请参考以下文章
#yyds干货盘点# Python网络爬虫之js逆向之远程调用(rpc)免去抠代码补环境简介
# yyds干货盘点 # Python网络爬虫之js逆向之远程调用(rpc)免去抠代码补环境简介
Python练习册 第 0013 题: 用 Python 写一个爬图片的程序,爬 这个链接里的日本妹子图片 :-),(http://tieba.baidu.com/p/2166231880)(代码片段)