踩了很多坑，主要是 Python 2 的编码问题，以及对正则表达式不熟。
直接上脚本
# -*- coding: utf-8 -*-
"""Download the "Longzu" sentence pages from juzimi.com into a text file.

The script runs unchanged on Python 2 and Python 3: text is kept as unicode
internally and only encoded/decoded at the I/O boundaries (HTTP response and
output file). Save this file as UTF-8 so the declaration above stays true.
"""
import contextlib
import io
import re
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request offers the same Request/urlopen API.
    import urllib.request as urllib2


class spider(object):
    """Crawler for the "Longzu" article on the Juzimi site.

    Attributes:
        enable: True until ``do_work`` has completed one crawl.
        page: Number of the next page ``do_work`` will download.
        output_path: File the extracted sentences are appended to.
        page_count: Number of pages a ``do_work`` run downloads.
        delay: Seconds to wait between two page requests.
    """

    BASE_URL = 'http://www.juzimi.com/article/113093?page='
    USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(Khtml, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    # Captures the text between a class="xlistju" opening tag and the next
    # </a>; re.S lets a sentence span several lines. Compiled once and reused.
    _SENTENCE_RE = re.compile(r'class="xlistju">(.*?)</a>', re.S)

    def __init__(self, output_path=r'F:\py\longzu.txt', page_count=33,
                 delay=5):
        """Set up the crawler; the defaults reproduce the original script."""
        self.enable = True
        self.page = 1
        self.output_path = output_path
        self.page_count = page_count
        self.delay = delay

    def extract_sentences(self, html):
        """Return every sentence fragment found in the page text ``html``."""
        return self._SENTENCE_RE.findall(html)

    def load_page(self, page):
        """Download page number ``page`` and return its list of sentences.

        Raises whatever ``urlopen`` raises on network or HTTP errors, and
        ``UnicodeDecodeError`` if the response is not valid UTF-8.
        """
        url = self.BASE_URL + str(page)
        request = urllib2.Request(url, headers={'User-Agent': self.USER_AGENT})
        # closing() releases the connection on both Python 2 and 3, even when
        # reading or decoding fails.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            html = response.read().decode('utf-8')
        return self.extract_sentences(html)

    def deal_one_page(self, item_list, page):
        """Strip ``<br>`` tags from each item and append it to the output file.

        ``page`` is only used for the progress messages printed before and
        after the items are written.
        """
        print(u'第%d页' % page)
        for item in item_list:
            self.write_to_file(
                item.replace('<br />', '').replace('<br/>', ''))
        print('%d' % page)

    def write_to_file(self, txt):
        """Append ``txt`` plus a blank line to ``output_path`` as UTF-8."""
        if isinstance(txt, bytes):
            # Accept byte strings too, so the text-mode write below cannot fail.
            txt = txt.decode('utf-8')
        with io.open(self.output_path, 'a', encoding='utf-8') as handle:
            handle.write(txt)
            handle.write(u'\n\n')

    def do_work(self):
        """Crawl ``page_count`` pages starting at ``page``, then disable.

        Does nothing when ``enable`` is already False. ``page`` is advanced
        after every page that was processed successfully.
        """
        if not self.enable:
            return
        for index in range(self.page_count):
            if index:
                # Pause between requests only, not after the final page.
                time.sleep(self.delay)
            self.deal_one_page(self.load_page(self.page), self.page)
            self.page += 1
        print(u'结束')
        self.enable = False


if __name__ == "__main__":
    longspider = spider()
    longspider.do_work()