# Reference for scraping novels (小说爬取参考): http://blog.csdn.net/qq_22073849/article/details/78018980
# Status: only half finished (完成到一半)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Fetch pages from www.biquge.com.tw and pull links / a heading out of them.

The script is unfinished: ``get_data`` opens the output file but does not
write anything to it yet.
"""
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root that the hrefs collected by get_url_list() are resolved against.
BASE_URL = 'http://www.biquge.com.tw'
# Default file that get_data() opens in append mode.
OUTPUT_PATH = r'D:\Test EXCE1\HMXX.txt'
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download *url* and return its body parsed by BeautifulSoup with lxml.

    Raises:
        requests.RequestException: on connection problems, a timeout, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')


def get_url_list(url):
    """Return absolute URLs for the links matched by ``#list > dl > dd > a``.

    Each href found on the page at *url* is printed, resolved against
    ``BASE_URL`` and collected; the finished list is printed as well.

    Args:
        url: Address of the page to download and scan for links.

    Returns:
        list[str]: The resolved URLs, in document order.
    """
    soup = _fetch_soup(url)
    url_list = []
    # Alternative lookup kept from the original:
    # urls = soup.find('div',{'id':'list'}).find('dl').find_all('dd')
    for anchor in soup.select('#list > dl > dd > a'):
        href = anchor.get('href')
        if not href:
            # An <a> without href would otherwise break the URL join below.
            continue
        print(href)
        # urljoin yields the same result as plain concatenation for
        # root-relative hrefs and leaves already-absolute hrefs intact.
        url_list.append(urljoin(BASE_URL, href))
    print(url_list)
    return url_list


def get_data(url, output_path=OUTPUT_PATH):
    """Download the page at *url* and print the text of its bookname <h1>.

    The output file is opened in ``a+`` mode (so it is created when missing),
    but nothing is written to it yet.

    Args:
        url: Address of the page to download.
        output_path: File to open for appending; defaults to ``OUTPUT_PATH``.

    Raises:
        AttributeError: if the page has no ``<div class="bookname">`` with an
            ``<h1>`` inside it (``find`` returns None in that case).
        OSError: if *output_path* cannot be opened.
    """
    soup = _fetch_soup(url)
    # The context manager guarantees the handle is closed; the original
    # opened the file and never closed it.
    with open(output_path, 'a+', encoding='utf-8'):
        text_name = soup.find('div', {'class': 'bookname'}).find('h1').text
        print(text_name)
        # TODO: write the extracted text to the file; this part of the
        # script was never completed.


def main():
    """Run the scraper against the hard-coded sample page."""
    url = 'http://www.biquge.com.tw/18_18049/8057787.html'
    # get_url_list(url)
    get_data(url)


if __name__ == '__main__':
    main()