纯golang爬虫实战(二)
Posted pu369
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了纯golang爬虫实战(二)相关的知识,希望对你有一定的参考价值。
接上一篇文章 https://www.cnblogs.com/pu369/p/12202845.html ,上一篇只讲了原理,这次抽时间写了个实用版,将员工信息爬取到一个TXT文档中,以便于查询。上代码:
// Pure-Go crawler: scrapes employee records from an intranet HR system
// (Tomcat at 192.168.13.1:8080) into a single text file (hrp.txt) so the
// records can be searched locally.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"net/http/cookiejar"
	"os"
	"regexp"
	"strings"
	"time"
)

// Regular expressions used by trimHtml, compiled once at package scope
// instead of on every call (trimHtml runs for every one of ~20k pages).
var (
	reTag    = regexp.MustCompile(`\<[\S\s]+?\>`)
	reStyle  = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
	reScript = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
	reNbsp   = regexp.MustCompile(`&nbsp;`)
	reSpace  = regexp.MustCompile(`\s{2,}`)
)

// subsetRefs lists the "ref" codes of the per-employee auxiliary pages
// fetched by saveone, in the original order:
// 1 education, 2 position, 3 resume, 4 contract, 5 transfers, 6 relations,
// 7 family, 8 appointment, 9 duty, 10 specialty, 11 worker grade,
// 12 awards, 13 punishments.
var subsetRefs = []string{
	"A04", "A17", "A19", "AZ3", "A16", "A79", "A82",
	"AZT", "A07", "A10", "A13", "A28", "A29",
}

// MySpider holds the crawl state: the site entry URL, a cookie-aware HTTP
// client reused for every request, and an in-memory buffer that collects
// all scraped text until saveall flushes it to disk.
type MySpider struct {
	indexUrl string
	client   *http.Client
	buf      *bytes.Buffer
}

// login visits the index page so the cookie jar picks up a session id,
// then "logs in" by issuing a GET against the auth endpoint (the server
// accepts GET in place of the login form's POST). On success it kicks off
// the full crawl via saveall and returns the login response body.
//
// NOTE(review): credentials are hardcoded in the URL — move them to
// configuration before using this outside a lab network.
func (s *MySpider) login() (string, error) {
	resp, err := s.client.Get(s.indexUrl)
	if err != nil {
		return "err", err
	}
	// Drain and close so the keep-alive connection can be reused.
	io.Copy(io.Discard, resp.Body)
	resp.Body.Close()

	resp, err = s.client.Get("http://192.168.13.1:8080/login/auth?name=XX&password=XX&scurity=s&type=0&typeField=0")
	if err != nil {
		return "err", err
	}
	body, err := io.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		return "err", err
	}

	// Crawl every employee record now that the session is authenticated.
	s.saveall()
	return string(body), nil
}

// saveall iterates over all candidate employee ids, downloads each record
// into s.buf, and finally writes the whole buffer to hrp.txt in one go.
// Buffering everything in memory is acceptable for ~20k small records but
// would need streaming for much larger crawls.
func (s *MySpider) saveall() (string, error) {
	// The smallest id is 2; ids are 8-digit strings, zero-padded.
	for id := 2; id < 20000; id++ {
		idstr := fmt.Sprintf("%08d", id)
		// Main record page for this employee.
		url := "http://192.168.13.1:8080/browse/basicinfo_p.jsp?rtpage=psnfrm&pid=" + idstr + "&func=0297&userbase=Usr"
		s.saveone(url, idstr)
	}
	if err := os.WriteFile("hrp.txt", s.buf.Bytes(), 0644); err != nil {
		return "err", err
	}
	return "", nil
}

// saveone downloads one employee's main page. If the page is valid (not a
// Tomcat error page) it appends the id and the stripped page text to s.buf,
// then fetches every auxiliary page listed in subsetRefs.
func (s *MySpider) saveone(url, idstr string) (string, error) {
	resp, err := s.client.Get(url)
	if err != nil {
		return "err", err
	}
	body, err := io.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		return "err", err
	}
	page := string(body)
	// A nonexistent id yields "Apache Tomcat/5.0.19 - Error report"
	// (HTTP Status 500); skip those pages entirely.
	if strings.Contains(page, "Apache") {
		return "", nil
	}
	s.buf.WriteString(idstr + " ")
	s.buf.WriteString(trimHtml(page))
	s.buf.WriteString(" ")
	// The main page exists, so fetch the 13 auxiliary pages too.
	for _, ref := range subsetRefs {
		u := "http://192.168.13.1:8080/browse/subsetinfo_p.jsp?ref=" + ref + "&pid=" + idstr + "&userbase=Usr"
		s.saveonemore(u)
	}
	s.buf.WriteString(" ")
	return "", nil
}

// saveonemore downloads one auxiliary page and appends its stripped text
// to s.buf, followed by a space separator.
func (s *MySpider) saveonemore(url string) (string, error) {
	resp, err := s.client.Get(url)
	if err != nil {
		return "err", err
	}
	body, err := io.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		return "err", err
	}
	s.buf.WriteString(trimHtml(string(body)))
	s.buf.WriteString(" ")
	return "", nil
}

// trimHtml strips HTML markup from src: tags are lowercased, whole
// <style>/<script> blocks are removed, every remaining tag and each
// &nbsp; entity is replaced by a space, runs of 2+ whitespace characters
// collapse to the field separator " | ", and the result is trimmed.
func trimHtml(src string) string {
	// Lowercase every tag so the style/script patterns below match.
	src = reTag.ReplaceAllStringFunc(src, strings.ToLower)
	src = reStyle.ReplaceAllString(src, " ")
	src = reScript.ReplaceAllString(src, " ")
	src = reTag.ReplaceAllString(src, " ")
	src = reNbsp.ReplaceAllString(src, " ")
	src = reSpace.ReplaceAllString(src, " | ")
	return strings.TrimSpace(src)
}

// run builds the reusable cookie-aware HTTP client and starts the crawl
// by logging in (GET in place of POST).
func (s *MySpider) run() string {
	jar, err := cookiejar.New(nil)
	if err != nil {
		panic(err)
	}
	// One shared client: cookies persist the session, the timeout keeps a
	// hung server from stalling the whole crawl.
	s.client = &http.Client{Jar: jar, Timeout: 30 * time.Second}
	s.login()
	return ""
}

func main() {
	// Crawler instance; entry point is http://192.168.13.1:8080.
	ms := &MySpider{
		indexUrl: "http://192.168.13.1:8080",
		buf:      &bytes.Buffer{},
	}
	ms.run()
}
以上是关于纯golang爬虫实战(二)的主要内容,如果未能解决你的问题,请参考以下文章