Eleven Popular Hands-On Python Crawler Projects, Source Code Included
Python crawler: scrape cast photos of Fast & Furious 8 from Douban Movies
import urllib.request
import os
import re


def douban(url):
    r = urllib.request.urlopen(url)
    html = r.read().decode('utf-8')
    # Celebrity photo URLs and the corresponding names taken from the title attributes
    result = re.findall(r'https://img\d.doubanio.com/img/celebrity/medium/.*.jpg', html)
    result2 = re.findall(r'(?<=title=").\S+', html)
    result2.pop()
    # De-duplicate the names while keeping their order of first appearance
    result3 = sorted(set(result2), key=result2.index)
    result3.pop(-3)
    if not os.path.exists('douban'):
        os.makedirs('douban')
    i = 0
    for link in result:
        filename = 'douban\\' + str(result3[i]) + '.jpg'
        i += 1
        # urlretrieve writes the image straight to filename; no need to open the file first
        urllib.request.urlretrieve(link, filename)


url = 'https://movie.douban.com/subject/26260853/celebrities'
if __name__ == '__main__':
    douban(url)
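The line sorted(set(result2), key=result2.index) is an ordered de-duplication idiom: it drops repeated names while preserving the order in which each name first appeared on the page. A quick standalone illustration with toy data (not taken from the page):

names = ['Vin Diesel', 'Charlize Theron', 'Vin Diesel', 'Jason Statham', 'Charlize Theron']
unique_in_order = sorted(set(names), key=names.index)
print(unique_in_order)  # ['Vin Diesel', 'Charlize Theron', 'Jason Statham']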
Python crawler: save Douyu danmu (live-stream comments) to MongoDB
# Capture danmu messages and save each user's uid, nickname, level and comment text to MongoDB
__author__ = '布咯咯_rieuse'
__time__ = '2017.6.2'
__github__ = 'https://github.com/rieuse'
import multiprocessing
import re
import socket
import time
import pymongo
import requests
from bs4 import BeautifulSoup

clients = pymongo.MongoClient('localhost')
db = clients["DouyuTV_danmu"]
col = db["info"]
client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host = socket.gethostbyname("openbarrage.douyutv.com")
port = 8601
client.connect((host, port))
# Regexes that pull individual fields out of the raw barrage packets
danmu_path = re.compile(b'txt@=(.+?)/cid@')
uid_path = re.compile(b'uid@=(.+?)/nn@')
nickname_path = re.compile(b'nn@=(.+?)/txt@')
level_path = re.compile(b'level@=([1-9][0-9]?)/sahf')


def sendmsg(msgstr):
    # Frame the message: the packet length twice (4 bytes each), then the type code 689, all little-endian
    msg = msgstr.encode('utf-8')
    data_length = len(msg) + 8
    code = 689
    msgHead = int.to_bytes(data_length, 4, 'little') \
        + int.to_bytes(data_length, 4, 'little') + int.to_bytes(code, 4, 'little')
    client.send(msgHead)
    sent = 0
    while sent < len(msg):
        tn = client.send(msg[sent:])
        sent = sent + tn


def start(roomid):
    msg = 'type@=loginreq/username@=rieuse/password@=douyu/roomid@={}/\0'.format(roomid)
    sendmsg(msg)
    msg_more = 'type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid)
    sendmsg(msg_more)
    print('---------------欢迎连接到{}的直播间---------------'.format(get_name(roomid)))
    while True:
        data = client.recv(1024)
        if not data:
            break
        uid_more = uid_path.findall(data)
        nickname_more = nickname_path.findall(data)
        level_more = level_path.findall(data)
        danmu_more = danmu_path.findall(data)
        for i in range(0, len(danmu_more)):
            try:
                product = {
                    'uid': uid_more[i].decode('utf-8'),
                    'nickname': nickname_more[i].decode('utf-8'),
                    'level': level_more[i].decode('utf-8') if i < len(level_more) else '0',
                    'danmu': danmu_more[i].decode('utf-8')
                }
                print(product)
                col.insert_one(product)
                print('成功导入mongodb')
            except Exception as e:
                print(e)


def keeplive():
    # Send a heartbeat every 15 seconds so the server keeps the connection open
    while True:
        msg = 'type@=keeplive/tick@=' + str(int(time.time())) + '/\0'
        sendmsg(msg)
        time.sleep(15)


def get_name(roomid):
    r = requests.get("http://www.douyu.com/" + roomid)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.find('a', {'class': 'zb-name'}).string


if __name__ == '__main__':
    room_id = input('请输入房间ID: ')
    p1 = multiprocessing.Process(target=start, args=(room_id,))
    p2 = multiprocessing.Process(target=keeplive)
    p1.start()
    p2.start()
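sendmsg frames every client message with a 12-byte little-endian header (the packet length twice, then the type code). For clarity, here is a minimal receive-side sketch, written under the assumption that server packets use the same framing; recv_exact and read_packet are hypothetical helper names, not part of the original script:

import socket

def recv_exact(sock, n):
    # Keep calling recv() until exactly n bytes have been read
    buf = b''
    while len(buf) < n:
        chunk = sock.recv(n - len(buf))
        if not chunk:
            raise ConnectionError('socket closed')
        buf += chunk
    return buf

def read_packet(sock):
    # First 4 bytes: little-endian length of the remainder, mirroring the header built in sendmsg()
    length = int.from_bytes(recv_exact(sock, 4), 'little')
    body = recv_exact(sock, length)
    # Skip the repeated length (4 bytes) and the type code (4 bytes); the rest is the payload
    return body[8:]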
Python crawler: scrape Ximalaya FM audio data
__author__ = '布咯咯_rieuse'
import json
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree
clients = pymongo.MongoClient('localhost')
db = clients["XiMaLaYa"]
col1 = db["album2"]
col2 = db["detaile2"]
UA_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers1 = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
'Cache-Control': 'max-age=0',
'Proxy-Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': random.choice(UA_LIST)
}
headers2 = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
'Cache-Control': 'max-age=0',
'Proxy-Connection': 'keep-alive',
'Referer': 'http://www.ximalaya.com/dq/all/2',
'Upgrade-Insecure-Requests': '1',
'User-Agent': random.choice(UA_LIST)
}
def get_url():
    start_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
    for start_url in start_urls:
        html = requests.get(start_url, headers=headers1).text
        soup = BeautifulSoup(html, 'lxml')
        for item in soup.find_all(class_="albumfaceOutter"):
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src']
            }
            col1.insert_one(content)
            print('写入一个频道' + item.a['href'])
            print(content)
            another(item.a['href'])
        time.sleep(1)


def another(url):
    html = requests.get(url, headers=headers2).text
    ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    if len(ifanother):
        num = ifanother[0]
        print('本频道资源存在' + num + '个页面')
        for n in range(1, int(num)):
            print('开始解析{}个中的第{}个页面'.format(num, n))
            url2 = url + '?page={}'.format(n)
            get_m4a(url2)
    get_m4a(url)


def get_m4a(url):
    time.sleep(1)
    html = requests.get(url, headers=headers2).text
    numlist = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')[0].split(',')
    for i in numlist:
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
        html = requests.get(murl, headers=headers1).text
        dic = json.loads(html)
        col2.insert_one(dic)
        print(murl + '中的数据已被成功插入mongodb')


if __name__ == '__main__':
    get_url()
Python crawler: scrape all Shixiseng internship listings by analyzing the site's API (packet capture)
import json
import requests
import pymongo
import time
clients = pymongo.MongoClient('localhost')
db = clients["Shixiseng"]
col = db["detail_info"]
urls = ['http://www.shixiseng.com/app/internsvt?c=%E5%85%A8%E5%9B%BD&p={}&t=hot'.format(n) for n in range(1, 3487)]
for url in urls:
    print(url)
    r = requests.get(url)
    html = r.content.decode('utf-8')
    # The API returns JSON; the listings live under msg -> b
    content = json.loads(html)['msg']['b']
    for i in content:
        print('插入一条数据:')
        print(i)
        col.insert_one(i)
    time.sleep(0.01)
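The loop above assumes every one of the ~3,486 API requests succeeds and returns valid JSON. A hedged sketch of a more forgiving fetch helper (the name fetch_page and the retry policy are illustrative, not part of the original script):

import time
import requests

def fetch_page(url, retries=3):
    # Retry transient failures a few times, then give up on that page
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            return r.json()['msg']['b']
        except (requests.RequestException, ValueError, KeyError) as e:
            print('request {} failed on attempt {}: {}'.format(url, attempt + 1, e))
            time.sleep(1)
    return []

The main loop could then iterate over fetch_page(url) and call col.insert_one(item) for each item, skipping pages that never succeed.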
Python crawler: batch-download high-resolution images from Huaban and save them locally
__author__ = '布咯咯_rieuse'
import os
import lxml.html
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 5)
browser.set_window_size(1400, 900)
def parser(url, param):
    # Shared parsing helper: load the page, wait for the selector, return an lxml tree
    browser.get(url)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param)))
    html = browser.page_source
    doc = lxml.html.fromstring(html)
    return doc


def get_main_url():
    print('打开主页搜寻链接中...')
    try:
        doc = parser('http://huaban.com/boards/favorite/beauty/', '#waterfall')
        name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
        u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
        for item, fileName in zip(u, name):
            main_url = 'http://huaban.com' + item
            print('主链接已找到' + main_url)
            if '*' in fileName:
                fileName = fileName.replace('*', '')
            download(main_url, fileName)
    except Exception as e:
        print(e)


def download(main_url, fileName):
    print('-------准备下载中-------')
    try:
        doc = parser(main_url, '#waterfall')
        if not os.path.exists('image\\' + fileName):
            print('创建文件夹...')
            os.makedirs('image\\' + fileName)
        link = doc.xpath('//*[@id="waterfall"]/div/a/@href')
        # print(link)
        i = 0
        for item in link:
            i += 1
            minor_url = 'http://huaban.com' + item
            doc = parser(minor_url, '#pin_view_page')
            img_url = doc.xpath('//*[@id="baidu_image_holder"]/a/img/@src')
            img_url2 = doc.xpath('//*[@id="baidu_image_holder"]/img/@src')
            img_url += img_url2
            try:
                url = 'http:' + str(img_url[0])
                print('正在下载第' + str(i) + '张图片,地址:' + url)
                r = requests.get(url)
                filename = 'image\\{}\\'.format(fileName) + str(i) + '.jpg'
                with open(filename, 'wb') as fo:
                    fo.write(r.content)
            except Exception:
                print('出错了!')
    except Exception:
        print('出错啦!')


if __name__ == '__main__':
    get_main_url()
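Note that webdriver.PhantomJS (used here and in the Amazon project below) has been removed from recent Selenium releases. A hedged substitution sketch using headless Chrome instead, assuming a matching chromedriver is available on PATH:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')              # run without opening a browser window
options.add_argument('--window-size=1400,900')
browser = webdriver.Chrome(options=options)     # drop-in replacement for the PhantomJS line above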
Python crawler: scrape V2EX posts and save them to CSV
import csv
import re

import requests
from bs4 import BeautifulSoup

url = 'https://www.v2ex.com/?tab=all'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
articles = []
for article in soup.find_all(class_='cell item'):
    title = article.find(class_='item_title').get_text()
    category = article.find(class_='node').get_text()
    author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
    u = article.select('.item_title > a')
    link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0]
    articles.append([title, category, author, link])

# newline='' prevents blank rows on Windows; the 'document' folder must already exist
with open(r'document\v2ex.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '分类', '作者', '文章地址'])
    for row in articles:
        writer.writerow(row)
Python crawler: three ways to scrape the Wandoujia Design Award pages, with a speed comparison
__author__ = '布咯咯_rieuse'
import asyncio
import random
import time
import aiohttp
import pymongo
import requests
import multiprocessing
from bs4 import BeautifulSoup
# Shared setup: MongoDB connection, target URLs, headers and proxy
clients = pymongo.MongoClient('localhost')
db = clients["wandoujia"]
col = db["info"]
urls = ['http://www.wandoujia.com/award?page={}'.format(num) for num in range(1, 46)]
UA_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
'Connection': 'keep-alive',
'Host': 'www.wandoujia.com',
'User-Agent': random.choice(UA_LIST)
}
proxies = {
'http': 'http://123.206.6.17:3128',
'https': 'http://123.206.6.17:3128'
}
# Method 1: plain sequential requests
def method_1():
    start = time.time()
    for url in urls:
        html = requests.get(url, headers=headers, proxies=proxies).text
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert_one(content)
            print('成功插入一组数据' + str(content))
    print('一共用时:' + str(time.time() - start))

# if __name__ == '__main__':
#     method_1()
# Method 2: requests + multiprocessing.Pool
def method_2(url):
    html = requests.get(url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find_all(class_='title')
    app_title = soup.find_all(class_='app-title')
    item_cover = soup.find_all(class_='item-cover')
    icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
    for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
        content = {
            'title': title_i.get_text(),
            'app_title': app_title_i.get_text(),
            'item_cover': item_cover_i['data-original'],
            'icon_cover': icon_cover_i['data-original']
        }
        # time.sleep(1)
        col.insert_one(content)
        print('成功插入一组数据' + str(content))

# if __name__ == '__main__':
#     start = time.time()
#     pool = multiprocessing.Pool(4)
#     pool.map(method_2, urls)
#     pool.close()
#     pool.join()
#     print('一共用时:' + str(time.time() - start))
# Method 3: asyncio + aiohttp (the asynchronous I/O stack available since Python 3.4)
def method_3():
    async def get_url(url):
        # 'async def' declares a coroutine; calling it returns a coroutine object
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as html:
                # 'await' suspends the coroutine until the asynchronous I/O completes
                response = await html.text(encoding="utf-8")
                return response

    async def parser(url):
        html = await get_url(url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert_one(content)
            print('成功插入一组数据' + str(content))

    start = time.time()
    loop = asyncio.get_event_loop()
    tasks = [parser(url) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
    print(time.time() - start)


if __name__ == '__main__':
    method_3()
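Note that in method_3 the BeautifulSoup parsing and the MongoDB inserts still run synchronously inside each coroutine, and all 45 pages are requested at once. A hedged variant of the fetch coroutine that caps concurrency with an asyncio.Semaphore (the name get_url_limited and the limit of 10 are illustrative choices, not part of the original benchmark):

import asyncio
import aiohttp

sem = asyncio.Semaphore(10)  # allow at most 10 requests in flight at any time

async def get_url_limited(url):
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return await resp.text(encoding='utf-8')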
Python crawler: parse HTML with lxml and print the extracted values
import requests
import lxml.html

url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml'
html = requests.get(url).text
doc = lxml.html.fromstring(html)
titles = doc.xpath('//div[@class="newsList"]/ul/li/a/text()')
href = doc.xpath('//div[@class="newsList"]/ul/li/a/@href')
# Pair each title with its link instead of tracking an index by hand
for title, link in zip(titles, href):
    results = {
        '标题': title,
        '链接': link
    }
    print(results)
Python crawler: scrape dynamically loaded data from Yidian Zixun with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import csv

driver = webdriver.Firefox()
driver.implicitly_wait(3)
first_url = 'http://www.yidianzixun.com/channel/c6'
driver.get(first_url)
driver.find_element(By.CLASS_NAME, 'icon-refresh').click()
# Press the Down key repeatedly so the page keeps lazy-loading more articles
for i in range(1, 90):
    driver.find_element(By.CLASS_NAME, 'icon-refresh').send_keys(Keys.DOWN)
soup = BeautifulSoup(driver.page_source, 'lxml')
articles = []
for article in soup.find_all(class_='item doc style-small-image style-content-middle'):
    title = article.find(class_='doc-title').get_text()
    source = article.find(class_='source').get_text()
    comment = article.find(class_='comment-count').get_text()
    link = 'http://www.yidianzixun.com' + article.get('href')
    articles.append([title, source, comment, link])
driver.quit()

# newline='' prevents blank rows on Windows; the 'document' folder must already exist
with open(r'document\yidian.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '作者', '评论数', '文章地址'])
    for row in articles:
        writer.writerow(row)
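Sending Keys.DOWN to the refresh icon is one way to trigger the page's infinite scroll. A hedged alternative is to scroll the window directly through JavaScript with the same driver (the 0.5 s pause is an arbitrary choice):

import time

for _ in range(90):
    # Jump to the bottom of the page so the next batch of articles is lazy-loaded
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(0.5)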
Python crawler: scrape Amazon search results with Selenium + XPath + bs4 and save to MongoDB
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from bs4 import BeautifulSoup
import lxml.html
import pymongo
import re
MONGO_URL = 'localhost'
MONGO_DB = 'amazon'
MONGO_TABLE = 'amazon-python'
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
KEYWORD = 'python'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)
def search():
    print('正在搜索')
    try:
        browser.get('https://www.amazon.cn/')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#twotabsearchtextbox'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#nav-search > form > div.nav-right > div > input')))
        input.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#pagn > span.pagnDisabled')))
        get_products()
        print('一共' + total.text + '页')
        return total.text
    except TimeoutException:
        return search()


def next_page(number):
    print('正在翻页', number)
    try:
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#pagnNextString'), '下一页'))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pagnNextString')))
        submit.click()
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '.pagnCur'), str(number)))
        get_products()
    except TimeoutException:
        next_page(number)


def get_products():
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s-results-list-atf')))
        html = browser.page_source
        soup = BeautifulSoup(html, 'lxml')
        doc = lxml.html.fromstring(html)
        date = doc.xpath('//*[@class="s-result-item celwidget "]/div/div[2]/div[1]/span[2]/text()')
        content = soup.find_all(attrs={"id": re.compile(r'result_\d+')})
        for item, time in zip(content, date):
            product = {
                'title': item.find(class_='s-access-title').get_text(),
                'image': item.find(class_='s-access-image cfMarker').get('src'),
                'price': item.find(class_='a-size-base a-color-price s-price a-text-bold').get_text(),
                'date': time
            }
            # save_to_mongo(product)
            print(product)
    except Exception as e:
        print(e)


def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('存储到mongodb成功', result)
    except Exception:
        print('存储到mongodb失败', result)


def main():
    try:
        total = int(search())
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('出错啦', e)
    finally:
        browser.close()


if __name__ == '__main__':
    main()
Python crawler: fetch the Heilongjiang University portal CAPTCHA and log in
import requests
from PIL import Image

url1 = 'http://my.hlju.edu.cn/captchaGenerate.portal?'
url2 = 'http://my.hlju.edu.cn/userPasswordValidate.portal'
url3 = 'http://my.hlju.edu.cn/index.portal'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
# Use one session so the CAPTCHA request and the login share the same cookies
s = requests.session()
response = s.get(url1, headers=headers)
# Save and display the CAPTCHA image (the 'img' folder must already exist)
with open('img\\code.jpg', 'wb') as f:
    f.write(response.content)
img = Image.open('img\\code.jpg')
img.show()
data = {}
data['Login.Token1'] = '20154433'   # account (student ID)
data['Login.Token2'] = ''           # password, fill in before running
data['captcha'] = input('输入验证码:')
data['goto'] = 'http://my.hlju.edu.cn/loginSuccess.portal'
data['gotoOnFail'] = 'http://my.hlju.edu.cn/loginFailure.portal'
response2 = s.post(url=url2, data=data, headers=headers)
response3 = s.get(url3, headers=headers)
print(response3.text)
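Since the form carries explicit goto / gotoOnFail redirect targets, a hedged way to check whether the login actually worked is to inspect the URL the session landed on after redirects. This check is an assumption about the portal's behaviour, not something verified in the original script:

# Hypothetical success check based on the redirect targets set in the form data above
if 'loginSuccess' in response2.url:
    print('login appears successful')
else:
    print('login failed: check account, password or CAPTCHA')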