纯golang爬虫实战(二)

Posted pu369

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了纯golang爬虫实战(二)相关的知识,希望对你有一定的参考价值。

接上一篇文章 https://www.cnblogs.com/pu369/p/12202845.html ,上一篇只讲了原理,这次抽时间写了个实用版,将员工信息爬取到一个TXT文档中,以便于查询,上代码:

//纯golang爬虫
package main

import (
    "bytes"
    "fmt"
    "io/ioutil"
    "net/http"
    "net/http/cookiejar"
    "regexp"
    "strings"
)

// MySpider bundles the state shared by the crawler's methods: the entry URL,
// the HTTP client used for every request, and the buffer in which extracted
// text is accumulated before it is written to disk.
type MySpider struct {
    // indexUrl is the entry page that login fetches before authenticating.
    indexUrl string
    // cleint is the HTTP client every request goes through; run assigns it.
    // NOTE(review): the name is a misspelling of "client"; it is left as-is
    // because all methods reference the field by this name.
    cleint   *http.Client
    // buf collects the stripped page text; saveall writes it to the output file.
    buf      *bytes.Buffer
}

// login fetches the index page, authenticates by sending the login request
// as a GET (credentials travel in the query string) instead of a POST, and
// then starts the full crawl via saveall.
//
// It returns the body of the login response on success. On any failure
// (index page, login request, reading the login body, or the crawl itself)
// it returns the placeholder string "err" together with the error.
func (this MySpider) login() (string, error) {
	// Visit the index page first.
	// NOTE(review): presumably this is what hands out the session cookie — confirm.
	resp, err := this.cleint.Get(this.indexUrl)
	if err != nil {
		return "err", err
	}
	// The index body is not needed; drain and close it so the connection can
	// be reused. A read error here is irrelevant because the content is discarded.
	_, _ = ioutil.ReadAll(resp.Body)
	resp.Body.Close()

	// Request the login endpoint.
	resp, err = this.cleint.Get("http://192.168.13.1:8080/login/auth?name=XX&password=XX&scurity=s&type=0&typeField=0")
	if err != nil {
		return "err", err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "err", err
	}

	// Crawl every record now that the login request has been made.
	if _, err := this.saveall(); err != nil {
		return "err", err
	}
	return string(body), nil
}

// saveall walks the numeric employee ids from 2 up to (but not including)
// 20000, asks saveone to fetch each record into the shared buffer, and finally
// writes the whole buffer to "hrp.txt" in one go. Everything is held in memory
// until that final write, so a very large data set may exhaust memory.
//
// Results of the individual saveone calls are not inspected. The return value
// is ("", nil) on success and ("err", error) when writing the file fails.
func (this MySpider) saveall() (string, error) {
	const (
		firstID = 2     // smallest id to request
		endID   = 20000 // exclusive upper bound

		// Main employee record page; the 8-digit id goes between the two parts.
		pagePrefix = "http://192.168.13.1:8080/browse/basicinfo_p.jsp?rtpage=psnfrm&pid="
		pageSuffix = "&func=0297&userbase=Usr"
	)

	for n := firstID; n < endID; n++ {
		// Ids are 8-character strings, left-padded with zeros.
		pid := fmt.Sprintf("%08d", n)
		this.saveone(pagePrefix+pid+pageSuffix, pid)
	}

	// Flush everything that was collected to the output file.
	if werr := ioutil.WriteFile("hrp.txt", this.buf.Bytes(), 0644); werr != nil {
		return "err", werr
	}
	return "", nil
}

//下载某人员的主页面内容、保存到bytes.buffer
func (this MySpider) saveone(url, idstr string) (string, error) {
    resp, err := this.cleint.Get(url)
    if err != nil {
        return "err", err
    }
    body, err := ioutil.ReadAll(resp.Body)
    defer resp.Body.Close()
    stringbody := string(body)
    //fmt.Print(string(body))
    //判断主页面是否包含字样:Apache Tomcat/5.0.19 - Error report |  HTTP Status 500
    if ko := strings.Contains(stringbody, "Apache"); !ko {
        //主页面正常,则保存
        this.buf.Write([]byte(idstr + "
"))
        trimbody := []byte(trimHtml(stringbody))
        this.buf.Write(trimbody)
        this.buf.Write([]byte("
"))
        //有主页面,则下载辅助页面
        //员工记录附加页面-1学历
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A04&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-2岗位
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A17&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-3简历
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A19&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-4合同
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=AZ3&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-5流动
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A16&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-6关系
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A79&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-7家庭
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A82&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-8聘任
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=AZT&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-9职务
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A07&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-10专业
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A10&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-11工人
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A13&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-12奖励
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A28&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        //员工记录附加页面-13惩罚
        url = "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=A29&pid=" + idstr + "&userbase=Usr"
        this.saveonemore(url)
        this.buf.Write([]byte("

"))
    }
    return "", err
}

//下载某人员的辅助页面内容、保存到bytes.buffer
func (this MySpider) saveonemore(url string) (string, error) {
    resp, err := this.cleint.Get(url)
    if err != nil {
        return "err", err
    }
    body, err := ioutil.ReadAll(resp.Body)
    defer resp.Body.Close()
    stringbody := string(body)
    trimbody := []byte(trimHtml(stringbody))
    this.buf.Write(trimbody)
    this.buf.Write([]byte("
"))
    return "", err
}

// Patterns used by trimHtml, compiled once at start-up instead of on every
// call (trimHtml runs for every downloaded page).
var (
	// trimTagRe matches anything enclosed in angle brackets.
	trimTagRe = regexp.MustCompile(`<[\S\s]+?>`)
	// trimStyleRe matches a whole <style>...</style> element (lower-case tags).
	trimStyleRe = regexp.MustCompile(`<style[\S\s]+?</style>`)
	// trimScriptRe matches a whole <script>...</script> element (lower-case tags).
	trimScriptRe = regexp.MustCompile(`<script[\S\s]+?</script>`)
	// trimNbspRe matches the non-breaking-space entity, with or without its
	// terminating semicolon, so that "&nbsp;" does not leave a stray ";".
	trimNbspRe = regexp.MustCompile(`&nbsp;?`)
	// trimSpaceRunRe matches two or more consecutive whitespace characters.
	trimSpaceRunRe = regexp.MustCompile(`\s{2,}`)
)

// trimHtml reduces an HTML document to plain text.
//
// Steps, in order: lower-case every tag so the style/script patterns match
// regardless of the page's casing; replace <style> and <script> elements
// (including their content) with a space; replace every remaining tag with a
// space; replace "&nbsp;" / "&nbsp" with a space; collapse each run of two or
// more whitespace characters into the separator " |  "; trim leading and
// trailing whitespace.
func trimHtml(src string) string {
	src = trimTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	src = trimStyleRe.ReplaceAllString(src, " ")
	src = trimScriptRe.ReplaceAllString(src, " ")
	src = trimTagRe.ReplaceAllString(src, " ")
	src = trimNbspRe.ReplaceAllString(src, " ")
	src = trimSpaceRunRe.ReplaceAllString(src, " |  ")
	return strings.TrimSpace(src)
}

// run builds an HTTP client backed by a cookie jar, so that cookies set by one
// response are sent with later requests, and then calls login, which in turn
// performs the crawl. A failure reported by login is printed rather than
// dropped. run always returns the empty string.
//
// The receiver is a value, so the client assigned here lives on run's own copy
// of the spider; login is invoked on that same copy and therefore sees it.
// It panics if the cookie jar cannot be created.
func (this MySpider) run() string {
	jar, err := cookiejar.New(nil)
	if err != nil {
		panic(err)
	}
	// One client, reused for every request.
	this.cleint = &http.Client{Jar: jar}

	// Log in, using GET in place of POST.
	if _, err := this.login(); err != nil {
		fmt.Println("crawl failed:", err)
	}
	return ""
}

// main creates the spider with its entry address and an empty output buffer,
// then starts it.
func main() {
	spider := &MySpider{
		// Entry address of the site to crawl.
		indexUrl: "http://192.168.13.1:8080",
		buf:      bytes.NewBuffer([]byte{}),
	}
	spider.run()
}

以上是关于纯golang爬虫实战(二)的主要内容,如果未能解决你的问题,请参考以下文章

纯golang爬虫实战(三)

纯golang爬虫实战--使用mime/multipart传输附件

轻松搞定Golang爬虫实战(文末有资源哦~)

golang代码片段(摘抄)

基于golang的爬虫实战

纯手工打造简单分布式爬虫(Python)