A Pure-Golang Web Crawler in Practice (Part 5: Summary)
Posted by pu369
This post tidies up the code from the previous articles into four general-purpose methods, falling into two patterns:
1 Plain Get or Post: the site usually blocks or restricts these requests;
2 Get or Post with request headers: this mimics a browser and usually goes through normally.
The code (note: the literal */* used when setting the HTTP headers below makes the code display a little oddly on the page, but does not affect the code itself):
// Header is the request-header block copied straight from the Chrome console in its
// "view source" form (Request Headers); only the colon-separated lines are included.
// FormData is likewise the "view source" form of the Form Data copied from the Chrome console.
// 1: Get            ms.Get()            requires ms.Url to be set first
// 2: GetWitdHeader  ms.GetWitdHeader()  requires ms.Url and ms.Header
// 3: Post           ms.Post()           requires ms.Url and ms.FormData
// 4: PostWitdHeader ms.PostWitdHeader() requires ms.Url, ms.Header and ms.FormData
// If you run into GBK mojibake, see https://www.cnblogs.com/pu369/p/12228659.html
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "net/http/cookiejar"
    "strings"

    "golang.org/x/text/encoding/simplifiedchinese"
)

type MySpider struct {
    Url, Header, FormData string
    Client                *http.Client
}

func main() {
    // create a spider instance
    ms := NewMySpider()

    // fetch the login page
    ms.Url = "http://192.168.132.80/wui/theme/ecology7/page/login.jsp"
    fmt.Println(ms.Get())
    // The GET above, sent without headers, gets back a string containing "XSS",
    // while a browser loads the page normally -- so the request headers have to be set.

    ms.Url = "http://192.168.132.80/wui/theme/ecology7/page/login.jsp"
    ms.Header = `Host: 192.168.132.80
Connection: keep-alive
Pragma: no-cache
Cache-Control: no-cache
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cookie: JSESSIONID=abcIswHnk9uU49ql9MP2w; testBanCookie=test; loginfileweaver=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D; languageidweaver=7; loginidweaver=114`
    fmt.Println(ms.GetWitdHeader())
    // With the headers set, the same GET now returns the same page source a browser sees.

    // Log in.
    // The site's login form posts to http://192.168.132.80/login/RemindLogin.jsp?RedirectFile=/portal/plugin/homepage/ecology7theme/index.jsp?templateId=3,
    // which then 302-redirects to http://192.168.132.80/login/RemindLogin.jsp?RedirectFile=/portal/plugin/homepage/ecology7theme/index.jsp?templateId=3;
    // POSTing to either URL gives the same result.
    ms.Url = "http://192.168.132.80/login/RemindLogin.jsp?RedirectFile=/portal/plugin/homepage/ecology7theme/index.jsp?templateId=3"
    ms.FormData = `loginfile=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D&logintype=1&fontName=%CE%A2%EF%BF%BD%EF%BF%BD%EF%BF%BD%C5%BA%EF%BF%BD&message=16&gopage=&formmethod=post&rnd=&serial=&username=&isie=true&loginid=admin&userpassword=1234&submit=`
    fmt.Println(ms.Post())
    // The POST above, sent without headers, again gets back a string containing "XSS",
    // so the headers are needed here as well.

    // Url, Header and FormData are all set by now, so ms.PostWitdHeader() can be
    // called directly -- the response shows the login succeeds.
    fmt.Println(ms.PostWitdHeader())

    // Finally, request a page that is only reachable after login; it works even
    // without the headers being attached.
    ms.Url = "http://192.168.132.80/CRM/data/CustomerBrowser.jsp?splitflag="
    fmt.Println(ms.Get())
}

// 1: plain GET; ms.Get() requires ms.Url to be set first
func (this MySpider) Get() string {
    resp, err := this.Client.Get(this.Url)
    if err != nil {
        return "" // handle error: the request itself failed
    }
    defer resp.Body.Close()
    // transcode the GB18030/GBK response body to UTF-8
    reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
    body, err := ioutil.ReadAll(reader)
    if err != nil {
        return "" // handle error: reading the body failed
    }
    return string(body)
}

// 2: GET with headers; ms.GetWitdHeader() requires ms.Url and ms.Header
func (this MySpider) GetWitdHeader() string {
    req, err := http.NewRequest("GET", this.Url, nil)
    if err != nil {
        return "" // handle error: building the request failed
    }
    // split the Header block into a key slice ak and a value slice av,
    // one "Name: value" pair per line
    a := strings.Split(this.Header, "\n")
    ak := make([]string, len(a))
    av := make([]string, len(a))
    // copy duplicates the values; plain assignment would only copy the slice header,
    // so modifying ak would also modify av
    copy(ak, a)
    copy(av, a)
    //fmt.Println(ak[0], av[0])
    for k, v := range ak {
        i := strings.Index(v, ":")
        ak[k] = v[:i]
        av[k] = strings.TrimSpace(v[i+1:]) // drop the space after the colon (and any stray \r)
        // set the header on the request
        req.Header.Set(ak[k], av[k])
    }
    resp, err := this.Client.Do(req)
    if err != nil {
        return "" // handle error: the request itself failed
    }
    defer resp.Body.Close()
    reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
    body, err := ioutil.ReadAll(reader)
    if err != nil {
        return "" // handle error: reading the body failed
    }
    return string(body)
}

// 3: plain POST; ms.Post() requires ms.Url and ms.FormData
func (this MySpider) Post() string {
    resp, err := this.Client.Post(this.Url, "application/x-www-form-urlencoded", strings.NewReader(this.FormData))
    if err != nil {
        return "" // handle error: the request itself failed
    }
    defer resp.Body.Close()
    reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
    body, err := ioutil.ReadAll(reader)
    if err != nil {
        return "" // handle error: reading the body failed
    }
    return string(body)
}

// 4: POST with headers; ms.PostWitdHeader() requires ms.Url, ms.Header and ms.FormData
func (this MySpider) PostWitdHeader() string {
    req, err := http.NewRequest("POST", this.Url, strings.NewReader(this.FormData))
    if err != nil {
        return "" // handle error: building the request failed
    }
    // split the Header block into a key slice ak and a value slice av,
    // one "Name: value" pair per line
    a := strings.Split(this.Header, "\n")
    ak := make([]string, len(a))
    av := make([]string, len(a))
    // copy duplicates the values; plain assignment would only copy the slice header,
    // so modifying ak would also modify av
    copy(ak, a)
    copy(av, a)
    //fmt.Println(ak[0], av[0])
    for k, v := range ak {
        i := strings.Index(v, ":")
        ak[k] = v[:i]
        av[k] = strings.TrimSpace(v[i+1:]) // drop the space after the colon (and any stray \r)
        // set the header on the request
        req.Header.Set(ak[k], av[k])
    }
    resp, err := this.Client.Do(req)
    if err != nil {
        return "" // handle error: the request itself failed
    }
    defer resp.Body.Close()
    reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
    body, err := ioutil.ReadAll(reader)
    if err != nil {
        return "" // handle error: reading the body failed
    }
    return string(body)
}

// constructor: every MySpider shares one http.Client with a cookie jar,
// so cookies set by one request are sent on the following ones
func NewMySpider() *MySpider {
    var client http.Client
    jar, err := cookiejar.New(nil)
    if err != nil {
        panic(err)
    }
    client.Jar = jar
    return &MySpider{
        Client: &client,
    }
}
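A note on why the last ms.Get() works without any headers: all four methods share the single http.Client built in NewMySpider(), and its cookie jar stores the JSESSIONID and related cookies returned during login, then replays them on every later request to the same host. The helper below is a sketch of my own (printCookies is not part of the original code, and it needs an extra net/url import); it simply dumps what the jar would attach to a given URL, which is handy right after the PostWitdHeader call.

// printCookies is a hypothetical debugging helper (not in the original code):
// it prints whatever the shared cookie jar would attach to a request for rawurl.
// It needs the extra import "net/url".
func printCookies(ms *MySpider, rawurl string) {
    u, err := url.Parse(rawurl)
    if err != nil {
        panic(err)
    }
    for _, c := range ms.Client.Jar.Cookies(u) {
        fmt.Println(c.Name, "=", c.Value)
    }
}

For example, calling printCookies(ms, "http://192.168.132.80/") right after the PostWitdHeader call should show the session cookies the server issued during login.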
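One caveat about the copied headers: they include Accept-Encoding: gzip, deflate. When a Go request sets Accept-Encoding explicitly, the transport no longer decompresses the response for you, so the GB18030 decoder may be handed raw gzip bytes and return garbage. The simplest fix is to delete that line from the pasted header block; alternatively, something like the sketch below could unwrap the body first (decodeBody is a hypothetical helper, not from the article, and it needs the extra imports compress/gzip and io):

// decodeBody is a hypothetical helper: it un-gzips the response body when the
// server reports gzip compression, then transcodes GB18030 to UTF-8, mirroring
// what the article's methods do for uncompressed responses.
func decodeBody(resp *http.Response) (string, error) {
    var r io.Reader = resp.Body
    if strings.EqualFold(resp.Header.Get("Content-Encoding"), "gzip") {
        gz, err := gzip.NewReader(resp.Body)
        if err != nil {
            return "", err
        }
        defer gz.Close()
        r = gz
    }
    body, err := ioutil.ReadAll(simplifiedchinese.GB18030.NewDecoder().Reader(r))
    return string(body), err
}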
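Lastly, the header-parsing loop is duplicated verbatim in GetWitdHeader and PostWitdHeader. If this spider keeps growing, one possible cleanup (just a sketch; setHeaders is a name I am introducing here, not from the original post) is to move that loop into a shared method:

// setHeaders is a hypothetical shared helper: it walks the raw header block
// copied from Chrome (one "Name: value" pair per line) and applies each pair
// to req, skipping any line without a colon.
func (this MySpider) setHeaders(req *http.Request) {
    for _, line := range strings.Split(this.Header, "\n") {
        i := strings.Index(line, ":")
        if i < 0 {
            continue
        }
        req.Header.Set(strings.TrimSpace(line[:i]), strings.TrimSpace(line[i+1:]))
    }
}

Both GetWitdHeader and PostWitdHeader would then shrink to building the request, calling this.setHeaders(req), and reading the response exactly as before.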