Python crawler for Baidu Space (百度空间) blog posts
Preface: compiled by the cha138.com editors, this post introduces a Python crawler for Baidu Space blog posts; hopefully it is of some reference value.
#!/usr/bin/env python
# coding=utf8
# author=evi1m0#n0tr00t
# Fri Apr 10 14:14:35 2015
import os
import re
import sys
import wget              # third-party: pip install wget
import requests          # third-party: pip install requests
import urlparse          # Python 2 stdlib (urllib.parse in Python 3)
import threadpool as tp  # third-party: pip install threadpool
def _archives(author):
    """Collect the URL of every post in the author's archive."""
    archives_url = 'http://hi.baidu.com/{}/archive'.format(author)
    print '[*] Target URL: {}'.format(archives_url)
    # The archive index lists per-month links inside the fi-list block.
    year_content = requests.get(archives_url).content
    years = re.findall('<div class=fi-list id=fiList>(.*?)</section>', year_content)[0]
    months = re.findall('<a href="(.*?)" class="fi-border-bt2', years)
    print '[*] Months count: {}'.format(len(months))
    months_url = []
    archives_list = []
    # Keep only links whose query string selects a month.
    for month in months:
        if 'month=' in urlparse.urlparse(month).query:
            months_url.append(month)
    # Visit each month page and pull out the individual post URLs.
    for url in months_url:
        month_content = requests.get(url).content
        urls = re.findall('</div><a href="(.*?)" class=info-detail target=_blank>', month_content)
        for u in urls:
            archives_list.append(u)
    return archives_list
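# For reference (the exact link shape is an assumption, not taken from the
# original post): urlparse splits a link into components so the filter above
# can look only at the query string, e.g.
#   urlparse.urlparse('http://hi.baidu.com/evi1m0/archive?month=201504').query
#   -> 'month=201504', so the 'month=' test keeps this link.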
def main(url):
    """Download a single post; the post's <h2> title becomes the file name."""
    _page = requests.get(url).content
    _title = re.findall('<h2 class="title content-title">(.*?)</h2>', _page)[0]
    _filename = '{author}/{title}'.format(author=sys.argv[1], title=_title)
    print '[+] Download: {}'.format(_title)
    try:
        # bar='' suppresses wget's progress-bar output.
        wget.download(url, out=_filename, bar='')
    except Exception, e:
        print '[-] Error: ' + str(e)
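# Caveat (not handled in the original script): _title is used verbatim as a
# file name, so a title containing '/' or other special characters will make
# wget.download() fail. A minimal guard could be:
#   _title = re.sub(r'[\\/:*?"<>|]', '_', _title)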
if __name__ == '__main__':
    if len(sys.argv) == 1:
        print '[-] Usage: {} Blog_name'.format(sys.argv[0])
        print '[-] Example: {} evi1m0'.format(sys.argv[0])
        sys.exit()
    author = sys.argv[1]
    # Save posts into a directory named after the author.
    if not os.path.exists(author):
        os.mkdir(author)
    archives = _archives(author)
    print '[*] Archives statistics: {}'.format(len(archives))
    # Fan the downloads out over a 30-thread pool (threadpool module).
    pool = tp.ThreadPool(30)
    reqs = tp.makeRequests(main, archives)
    [pool.putRequest(req) for req in reqs]
    pool.wait()
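Baidu Space (hi.baidu.com) was shut down in 2015, and both Python 2 and the threadpool module are legacy, so the script above no longer runs against a live target. For comparison, here is a minimal Python 3 sketch of the same fan-out download pattern built on the standard library's concurrent.futures; fetch_post is a hypothetical stand-in for main() above, and the post URLs are assumed to be supplied on the command line.

#!/usr/bin/env python3
# Minimal Python 3 sketch of the same pattern (not the original author's code).
import sys
from concurrent.futures import ThreadPoolExecutor

import requests  # third-party: pip install requests

def fetch_post(url):
    # Hypothetical stand-in for main(): fetch one post and save it to disk.
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        filename = url.rstrip('/').rsplit('/', 1)[-1] or 'index.html'
        with open(filename, 'wb') as f:
            f.write(resp.content)
        print('[+] Downloaded: {}'.format(url))
    except Exception as e:
        print('[-] Error: {}: {}'.format(url, e))

if __name__ == '__main__':
    urls = sys.argv[1:]  # post URLs to mirror
    # ThreadPoolExecutor replaces threadpool.ThreadPool/makeRequests/putRequest.
    with ThreadPoolExecutor(max_workers=30) as pool:
        pool.map(fetch_post, urls)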
The above is the main content of this Python Baidu Space blog crawler. If it did not solve your problem, the following articles may help:
Python 3 crawler: scraping CSDN blog posts again with BeautifulSoup
How to fetch high-value blog posts with a Python crawler
One post to understand proxy configuration for Python crawler libraries!
A Python crawler for Baidu Translate
Python crawler: Baidu Tieba
How to auto-download Baidu Images with a Python crawler?