Pure Golang Web Scraping in Practice (Part 5: Wrap-up)

Posted by pu369


The code from the previous posts in this series is consolidated here into four general-purpose functions, which fall into two groups (a quick usage sketch follows the list):

1. A plain GET or POST, which the target site will usually restrict or block;

2. A GET or POST that carries request headers, which mimics a browser and usually goes through.
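
For orientation, the calling pattern is roughly the sketch below; the URL, header and form values are placeholders, and the complete working program follows in the next section:

    ms := NewMySpider()                      // one client with a cookie jar, shared by all requests
    ms.Url = "http://example.com/page.jsp"   // placeholder URL
    fmt.Println(ms.Get())                    // plain GET
    ms.Header = "User-Agent: Mozilla/5.0\nAccept: text/html" // placeholder headers, one "Key: value" per line
    fmt.Println(ms.GetWitdHeader())          // GET with the headers above
    ms.FormData = "loginid=admin&userpassword=1234"          // placeholder form data
    fmt.Println(ms.Post())                   // plain POST
    fmt.Println(ms.PostWitdHeader())         // POST with the headers and form data above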

The full code follows. (Note: the */* that appears in the HTTP header values below can upset the code highlighting on some pages, but it does not affect the code itself.)

//Header is the Request Headers block copied straight from the Chrome console in its "view source" form; keep only the colon-separated lines.
//FormData is likewise the Form Data copied from the Chrome console in "view source" form.
//1: Get:            ms.Get()            requires ms.Url to be set first
//2: GetWitdHeader:  ms.GetWitdHeader()  requires ms.Url and ms.Header to be set first
//3: Post:           ms.Post()           requires ms.Url and ms.FormData to be set first
//4: PostWitdHeader: ms.PostWitdHeader() requires ms.Url, ms.Header and ms.FormData to be set first
//If the output comes back as garbled GBK, see https://www.cnblogs.com/pu369/p/12228659.html

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "net/http/cookiejar"
    "strings"

    "golang.org/x/text/encoding/simplifiedchinese"
)

type MySpider struct {
    Url, Header, FormData string
    Client                *http.Client
}

func main() {
    // Create a spider instance
    ms := NewMySpider()
    // Fetch the home page
    ms.Url = "http://192.168.132.80/wui/theme/ecology7/page/login.jsp"
    fmt.Println(ms.Get())
    // The GET above, sent without any headers, returns a page containing an XSS-check string, yet the same URL opens normally in a browser, so request headers are clearly required.
    ms.Url = "http://192.168.132.80/wui/theme/ecology7/page/login.jsp"
    ms.Header = `Host: 192.168.132.80
Connection: keep-alive
Pragma: no-cache
Cache-Control: no-cache
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cookie: JSESSIONID=abcIswHnk9uU49ql9MP2w; testBanCookie=test; loginfileweaver=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D; languageidweaver=7; loginidweaver=114`
    fmt.Println(ms.GetWitdHeader())
    // With the headers set, the GET above returns the same page source a browser sees.
    // Now log in.
    // The site's login form posts to http://192.168.132.80/login/RemindLogin.jsp?RedirectFile=/portal/plugin/homepage/ecology7theme/index.jsp?templateId=3;
    // the server answers with a 302 redirect, and POSTing to either the form URL or the redirect target gives the same result.
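    // Go's http.Client follows redirects automatically (up to ten by default), and because the client was
    // created with a cookie jar, cookies set anywhere along the redirect chain are kept for later requests,
    // which is why both URLs behave the same here.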
    ms.Url = "http://192.168.132.80/login/RemindLogin.jsp?RedirectFile=/portal/plugin/homepage/ecology7theme/index.jsp?templateId=3"
    ms.FormData = `loginfile=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D&logintype=1&fontName=%CE%A2%EF%BF%BD%EF%BF%BD%EF%BF%BD%C5%BA%EF%BF%BD&message=16&gopage=&formmethod=post&rnd=&serial=&username=&isie=true&loginid=admin&userpassword=1234&submit=`

    fmt.Println(ms.Post())
    // The bare POST above, sent without headers, again returns a page containing an XSS-check string, so the headers must be carried.
    // Url, Header and FormData are all set at this point, so ms.PostWitdHeader() below can be called directly, and the login succeeds.
    fmt.Println(ms.PostWitdHeader())
    // Now request a page that is only reachable after login; it works even without headers, because the session cookie is already in the jar.
    ms.Url = "http://192.168.132.80/CRM/data/CustomerBrowser.jsp?splitflag="
    fmt.Println(ms.Get())

}

//1: Get: ms.Get() requires ms.Url to be set first
func (this MySpider) Get() string {
    resp, err := this.Client.Get(this.Url)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    defer resp.Body.Close()
    //Decode the GB18030/GBK response body to UTF-8
    reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
    body, err := ioutil.ReadAll(reader)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    return string(body)
}

//2: GetWitdHeader: ms.GetWitdHeader() requires ms.Url and ms.Header to be set first
func (this MySpider) GetWitdHeader() string {
    req, err := http.NewRequest("GET", this.Url, nil)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    //Split the pasted Header block into lines and set each "Key: value" pair on the request
    for _, line := range strings.Split(this.Header, "\n") {
        i := strings.Index(line, ":")
        if i < 0 {
            continue //skip blank or malformed lines
        }
        req.Header.Set(strings.TrimSpace(line[:i]), strings.TrimSpace(line[i+1:]))
    }
    resp, err := this.Client.Do(req)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    defer resp.Body.Close()
    //Decode the GB18030/GBK response body to UTF-8
    reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
    body, err := ioutil.ReadAll(reader)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    return string(body)
}

//3: Post: ms.Post() requires ms.Url and ms.FormData to be set first
func (this MySpider) Post() string {
    resp, err := this.Client.Post(this.Url, "application/x-www-form-urlencoded", strings.NewReader(this.FormData))
    if err != nil {
        fmt.Println(err)
        return ""
    }
    defer resp.Body.Close()
    //Decode the GB18030/GBK response body to UTF-8
    reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
    body, err := ioutil.ReadAll(reader)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    return string(body)
}

//4: PostWitdHeader: ms.PostWitdHeader() requires ms.Url, ms.Header and ms.FormData to be set first
func (this MySpider) PostWitdHeader() string {
    req, err := http.NewRequest("POST", this.Url, strings.NewReader(this.FormData))
    if err != nil {
        fmt.Println(err)
        return ""
    }
    //Split the pasted Header block into lines and set each "Key: value" pair on the request
    for _, line := range strings.Split(this.Header, "\n") {
        i := strings.Index(line, ":")
        if i < 0 {
            continue //skip blank or malformed lines
        }
        req.Header.Set(strings.TrimSpace(line[:i]), strings.TrimSpace(line[i+1:]))
    }
    //If the pasted header did not include a Content-Type, default it so the form body is parsed
    if req.Header.Get("Content-Type") == "" {
        req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
    }
    resp, err := this.Client.Do(req)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    defer resp.Body.Close()
    //Decode the GB18030/GBK response body to UTF-8
    reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
    body, err := ioutil.ReadAll(reader)
    if err != nil {
        fmt.Println(err)
        return ""
    }
    return string(body)
}

//Constructor: builds a spider whose http.Client carries a cookie jar, so session cookies persist across requests
func NewMySpider() *MySpider {
    var Client http.Client
    jar, err := cookiejar.New(nil)
    if err != nil {
        panic(err)
    }
    Client.Jar = jar
    return &MySpider{
        Client: &Client,
    }
}
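
One caveat on the pasted headers: they include Accept-Encoding: gzip, deflate. When that header is set manually, Go's transport will not transparently decompress the response, so a server that actually gzips its reply would leave the GB18030 conversion above running over compressed bytes. The simplest fix is to drop the Accept-Encoding line from the pasted header and let the transport negotiate gzip on its own; alternatively, the body-decoding step could be guarded as in the sketch below. This is only a sketch: decodeBody is a hypothetical helper, not part of the code above, it handles gzip only (not deflate), and it needs compress/gzip and io added to the imports.

    //decodeBody gunzips the response when the server marked it as gzip-compressed,
    //then converts the GB18030 body to UTF-8, mirroring the conversion used above.
    func decodeBody(resp *http.Response) (string, error) {
        var r io.Reader = resp.Body
        if resp.Header.Get("Content-Encoding") == "gzip" {
            gz, err := gzip.NewReader(resp.Body)
            if err != nil {
                return "", err
            }
            defer gz.Close()
            r = gz
        }
        body, err := ioutil.ReadAll(simplifiedchinese.GB18030.NewDecoder().Reader(r))
        if err != nil {
            return "", err
        }
        return string(body), nil
    }

Each of the four methods could then end with body, err := decodeBody(resp) instead of building the decoder reader inline.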

 
