python 爬虫练习

Posted 软件测试杂谈

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 爬虫练习相关的知识,希望对你有一定的参考价值。

bs去除特定标签。

# url
import easygui as g
import urllib.request
from bs4 import BeautifulSoup
import os
import sys
import re
import config.story2 as urls

# 获取url
# 获取url — prompt the user for the three URL fragments via an easygui form.
def set_url():
    """Collect the crawl URLs from the user with a multi-field dialog.

    Fields whose label starts with '*' are mandatory; the dialog is shown
    again (with an error message) until all mandatory fields are filled.

    Returns:
        The list of field values, or None if the user cancelled the dialog.
    """
    msg = "请填写一下信息(其中带*号的项为必填项)"
    title = "爬虫练习"
    field_names = ["*小说目录地址", "*组装前半段", "后半段"]
    field_values = g.multenterbox(msg, title, field_names)
    # Keep re-prompting until all required fields are filled or the user cancels.
    while field_values is not None:
        errmsg = ""
        for name, value in zip(field_names, field_values):
            # A leading '*' on the (stripped) label marks the field required.
            if value.strip() == "" and name.strip().startswith("*"):
                errmsg += ("【%s】为必填项   " % name)
        if errmsg == "":
            break
        field_values = g.multenterbox(errmsg, title, field_names, field_values)

    return field_values


# 下载网页内容,找到文章标题和对应的下载路径
# 下载网页内容,找到文章标题和对应的下载路径
def get_urls(seed_url, pre_url, last_url):
    """Download the index page and map each story title to its article URL.

    Args:
        seed_url: URL of the index page listing the stories.
        pre_url: prefix used to assemble each article URL.
        last_url: suffix used to assemble each article URL.

    Returns:
        dict mapping story title -> assembled article URL.
    """
    # 保存文章名称和地址
    story_list = {}
    response = urllib.request.urlopen(seed_url)
    # BUG FIX: decode() takes the codec name as a string; the bare name
    # utf-8 in the original was a NameError/SyntaxError at runtime.
    html = response.read().decode("utf-8")
    bs = BeautifulSoup(html, "html.parser")
    contents = bs.find_all("div", {"class": "c-line-bottom"})
    for each in contents:
        # 获取文章的 data-nsrc 属性
        nsrc = each.a["data-nsrc"]
        # 组装url — use a fresh name instead of clobbering the seed_url parameter.
        story_url = pre_url + nsrc + last_url
        # 获取文章标题 (the <p> text inside the entry)
        title = each.p.string
        story_list[title] = story_url

    return story_list

# 获取每个小说并下载
# 获取每个小说并下载
def getStory():
    """Download every story listed by get_urls and save each as a .txt file.

    Side effects: writes one UTF-8 text file per story under savepath.
    """
    # NOTE: a raw string with doubled backslashes (r"E:\\stories\\") kept
    # both backslashes literally; a plain escaped string is the intended path.
    savepath = "E:\\stories\\"
    story_list = get_urls(urls.url1, urls.url2, urls.url3)
    for name, story_url in story_list.items():
        # 获取小说: fetch and parse each story page.
        # BUG FIX: decode("utf-8") — the bare name utf-8 was a NameError.
        html = urllib.request.urlopen(story_url).read().decode("utf-8")
        bs = BeautifulSoup(html, "html.parser")

        # BUG FIX: tag names must be strings — bs("br") / find_all("p"),
        # not the bare names br / p. Strip all <br> tags, then collect
        # the <p> elements that hold the story text.
        for tag in bs("br"):
            tag.extract()
        content = bs.find_all("p")

        # BUG FIX: open mode must be the string "w". Use write() instead of
        # writelines() — writelines(str(...)) iterated the string char by char.
        with open(savepath + name + ".txt", "w", encoding="utf-8") as f:
            f.write(str(content))

# Script entry point: run the downloader only when executed directly,
# not when this module is imported by other code.
if __name__ == "__main__":
    getStory()

 

以上是关于python 爬虫练习的主要内容,如果未能解决你的问题,请参考以下文章

scrapy主动退出爬虫的代码片段(python3)

Python爬虫练习:爬取美团网成都地区的酒店信息

Python爬虫练习(拉勾网北京地区数据挖掘类职位所需技能统计)

python爬虫练习18:爬虫抓取视频思路2

python爬虫入门练习,使用正则表达式和requests爬取LOL官网皮肤

python 爬虫练习