爬虫大作业

Posted by VersonPENG

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫大作业相关的知识,希望对你有一定的参考价值。

代码:

#encoding=utf-8
import re
import requests
import urllib2
import datetime
# bug fix: the module is named MySQLdb (case-sensitive); the code below
# calls MySQLdb.Connect, so `import mysqldb` raised ImportError.
import MySQLdb
from bs4 import BeautifulSoup
import sys
reload(sys)
# Python 2 hack: make implicit str<->unicode conversions use UTF-8.
sys.setdefaultencoding("utf-8")
class Splider(object):
    """Scraper for whowhatwear.com fashion articles.

    Fetches listing pages, extracts article metadata and body text,
    and persists the results to a local MySQL table.
    """

    def __init__(self):
        # Parenthesized print behaves identically under Python 2 and 3.
        print(u'开始爬取内容...')

    def getsource(self, url):
        """Fetch *url* and return the raw response body as a byte string.

        A desktop-browser User-Agent is sent because the default urllib2
        UA is commonly blocked by sites.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/50.0.2652.0 Safari/537.36'}
        req = urllib2.Request(url=url, headers=headers)
        socket = urllib2.urlopen(req)
        try:
            return socket.read()
        finally:
            # fix: close the connection even if read() raises
            socket.close()

    def changepage(self, url, total_page):
        """Return listing-page URLs from the page number embedded in *url*
        up to *total_page*, inclusive.

        Raises AttributeError if *url* contains no 'page/<n>' segment.
        """
        now_page = int(re.search(r'page/(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            # bug fix: the original passed re.S as re.sub()'s *count*
            # positional argument; flags do not belong there and the
            # default count (replace all) is what is wanted here.
            page_group.append(re.sub(r'page/(\d+)', 'page/%d' % i, url))
        return page_group

    def getchildrencon(self, child_url):
        """Fetch one article page; return {'con': text, 'img': 'u1;u2;...'}."""
        content = self.getsource(child_url)
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        article = soup.find('div', {'class': 'c-article_content'})
        imgs = re.findall(r'src="(.*?)"', str(article), re.S)
        conobj = {}
        conobj['con'] = article.get_text()
        conobj['img'] = ';'.join(imgs)
        return conobj

    def getcontent(self, html_doc):
        """Parse a listing page and return {index: article-info dict}.

        Each value holds title, summary, content, content_time, imgs,
        source, source_url and create_time keys (all strings).
        """
        soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
        tag = soup.find_all('div', {'class': 'promo-feed-headline'})
        info = {}
        i = 0
        for link in tag:
            info[i] = {}
            title_desc = link.find('h3')
            info[i]['title'] = title_desc.get_text()
            post_date = link.find('div', {'class': 'post-date'})
            # keep only the YYYY-MM-DD prefix of the data-date attribute
            info[i]['content_time'] = post_date['data-date'][0:10]
            info[i]['source'] = 'whowhatwear'
            source_link = link.find('a', href=re.compile(r"section=fashion-trends"))
            source_url = 'http://www.whowhatwear.com' + source_link['href']
            info[i]['source_url'] = source_url
            in_content = self.getsource(source_url)
            in_soup = BeautifulSoup(in_content, 'html.parser', from_encoding='utf-8')
            soup_content = in_soup.find('section', {'class': 'widgets-list-content'})
            info[i]['content'] = soup_content.get_text().strip('\n')
            text_con = in_soup.find('section', {'class': 'text'})
            # bug fix: the original referenced the undefined name NULL and
            # dereferenced text_con.text *before* the None test, so a page
            # without a text section crashed. Fall back to an empty summary.
            summary = text_con.get_text().strip('\n') if text_con is not None else ''
            info[i]['summary'] = summary[0:200] + '...'
            img_list = re.findall(r'src="(.*?)"', str(soup_content), re.S)
            info[i]['imgs'] = ';'.join(img_list)
            info[i]['create_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            i += 1
        return info

    def saveinfo(self, content_info):
        """Insert every scraped article into MySQL table t_fashion_spider2.

        *content_info* is a list of the dicts produced by getcontent().
        """
        conn = MySQLdb.Connect(host='127.0.0.1', user='root', passwd='123456',
                               port=3306, db='test', charset='utf8')
        cursor = conn.cursor()
        # security fix: use a parameterized query instead of %-formatting
        # SQL with escape_string — the driver quotes every value safely.
        sql = ("insert into t_fashion_spider2(`title`,`summary`,`content`,"
               "`content_time`,`imgs`,`source`,`source_url`,`create_time`) "
               "values (%s,%s,%s,%s,%s,%s,%s,%s)")
        try:
            for each in content_info:
                for k, v in each.items():
                    cursor.execute(sql, (v['title'], v['summary'], v['content'],
                                         v['content_time'], v['imgs'], v['source'],
                                         v['source_url'], v['create_time']))
            conn.commit()
        finally:
            # fix: always release cursor and connection, even on error
            cursor.close()
            conn.close()
if __name__ == '__main__':
    classinfo = []
    p_num = 5
    # NOTE(review): this URL contains no 'page/<n>' segment, so
    # changepage()'s re.search returns None and the script crashes with
    # AttributeError — confirm the intended paginated start URL.
    url = 'http://china.nba.com/rockets/'
    jikesplider = Splider()
    all_links = jikesplider.changepage(url, p_num)
    # fix: the loop body was not indented in the original paste; each page
    # must be fetched, parsed and collected before the single bulk save.
    for link in all_links:
        print(u'正在处理页面:' + link)
        html = jikesplider.getsource(link)
        info = jikesplider.getcontent(html)
        classinfo.append(info)
    jikesplider.saveinfo(classinfo)

 

截图:

 

以上是关于爬虫大作业的主要内容,如果未能解决你的问题,请参考以下文章

爬虫大作业

爬虫大作业

HTML5期末大作业:餐饮美食网站设计——咖啡(10页) HTML+CSS+JavaScript 学生DW网页设计作业成品 web课程设计网页规划与设计 咖啡网页设计 美食餐饮网页设计...(代码片段

Python大作业——爬虫+可视化+数据分析+数据库(可视化篇)

Python大作业——爬虫+可视化+数据分析+数据库(数据分析篇)

Python课程设计大作业:利用爬虫获取NBA比赛数据并进行机器学习预测NBA比赛结果