Using the Colly Crawler Framework in Go
Posted by go研究所
This post walks through the Colly crawler framework for Go with three examples: basic link following, asynchronous crawling with a rate limit, and crawling a page that requires a login.

1. Basic usage: follow every link on a page

The collector below registers an OnHTML callback for every <a> element and visits each link it finds; OnRequest logs every URL before it is fetched.
package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	c := colly.NewCollector()

	// Runs for every <a> element found in a response; follow its link.
	c.OnHTML("a", func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})

	// Runs before every request is sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	c.Visit("https://docs.dbsgw.com/")
}
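The crawler above follows every link it finds, including links that lead off-site. A minimal variant (my own sketch, not from the original post) that uses the colly.AllowedDomains and colly.MaxDepth collector options to keep the crawl on docs.dbsgw.com:

package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	// Restrict the crawl to one host and limit recursion depth, so that
	// following every <a> link cannot wander off to other sites.
	c := colly.NewCollector(
		colly.AllowedDomains("docs.dbsgw.com"),
		colly.MaxDepth(2),
	)

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// Visits to hosts outside AllowedDomains are rejected by colly,
		// so the recursion stops at the domain boundary.
		e.Request.Visit(e.Attr("href"))
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	if err := c.Visit("https://docs.dbsgw.com/"); err != nil {
		fmt.Println(err)
	}
}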
2. Crawl the blog home page and the article pages

This example uses two collectors: c pulls the category links out of the home page sidebar, and its clone c1 visits those links asynchronously, under a rate limit, and prints the article titles it finds.
package main

import (
	"fmt"
	"strings"
	"time"

	"github.com/gocolly/colly"
)

func main() {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
	)

	// Clone the collector for the article pages and let it crawl asynchronously.
	c1 := c.Clone()
	c1.Async = true

	// Rate limit: a rule only takes effect when DomainGlob (or DomainRegexp)
	// matches the host of the requested URL.
	c1.Limit(&colly.LimitRule{
		DomainGlob:  "*.dbsgw.com",
		Delay:       10 * time.Second,
		Parallelism: 1,
	})

	// Extract the category links from the home page sidebar.
	c.OnHTML("body > section > div.sidebar > div.widget.widget_ui_tags.wow.article-categories.fadeInUp > div", func(e *colly.HTMLElement) {
		e.ForEach("a", func(i int, element *colly.HTMLElement) {
			href := element.Attr("href")
			if strings.Contains(href, "http") {
				fmt.Println(href, ">>>>> absolute link", i)
				c1.Request("GET", href, nil, nil, nil)
			} else {
				fmt.Println(element.Text, "<<<<< not an absolute link")
			}
		})
	})

	// Extract the article titles from the linked pages asynchronously.
	c1.OnHTML("body > section > div.content-wrap > div > article.excerpt.excerpt-1.wow.fadeInUp > header > h2 > a", func(e *colly.HTMLElement) {
		fmt.Println(e.Text)
	})

	err := c.Visit("https://blog.dbsgw.com/")
	if err != nil {
		fmt.Println(err.Error())
	}

	// Block until the asynchronous collector has finished all requests.
	c1.Wait()
}
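Setting c1.Async = true on the clone works; the same behaviour can also be requested when the collector is created, via the colly.Async option. The sketch below is my own illustration (the "article header h2 a" selector is a made-up simplification, not the real one from the blog theme) and shows why shared state needs a mutex once callbacks may run concurrently:

package main

import (
	"fmt"
	"sync"
	"time"

	"github.com/gocolly/colly"
)

func main() {
	// Async(true) is equivalent to setting c.Async = true by hand.
	c := colly.NewCollector(colly.Async(true))

	// The limit rule applies to requests whose host matches DomainGlob.
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*.dbsgw.com",
		Parallelism: 2,
		Delay:       2 * time.Second,
	})

	var (
		mu     sync.Mutex
		titles []string
	)

	// In async mode callbacks can run concurrently, so the shared slice
	// is protected by a mutex.
	c.OnHTML("article header h2 a", func(e *colly.HTMLElement) {
		mu.Lock()
		titles = append(titles, e.Text)
		mu.Unlock()
	})

	c.Visit("https://blog.dbsgw.com/")

	// Wait blocks until every queued request has finished, just like the
	// c1.Wait() call in the example above.
	c.Wait()

	fmt.Println("collected titles:", titles)
}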
3. Crawl a page that requires a login (by setting cookies)

For a page behind a login, the session cookies of an already logged-in browser can be copied into the collector with SetCookies; the collector then sends them with every request.
package main

import (
	"fmt"
	"net/http"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/debug"
	"github.com/gocolly/colly/extensions"
)

func main() {
	url := "https://blog.dbsgw.com/admin/"

	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
		colly.Debugger(&debug.LogDebugger{}),
	)

	// This block only exists in the admin backend, i.e. after a successful login.
	c.OnHTML("#admindex_servinfo > ul", func(e *colly.HTMLElement) {
		fmt.Println(e.Text, "---------------")
	})

	// Use a random User-Agent for each request.
	extensions.RandomUserAgent(c)

	// Copy the session cookies of an already logged-in browser session.
	c.SetCookies(url, []*http.Cookie{
		{
			Name:     "PHPSESSID",
			Value:    "3ur579rq6lindrkolomq24br17",
			Path:     "/",
			Domain:   "blog.dbsgw.com",
			Secure:   true,
			HttpOnly: true,
		},
		{
			Name:     "EM_AUTHCOOKIE_95yUDyuGoTu7WYejbLXmH8m6O630lMsd",
			Value:    "admin%7C%7Ca7e91e464d86db572e6e588ef9dd5815",
			Path:     "/",
			Domain:   "blog.dbsgw.com",
			Secure:   true,
			HttpOnly: true,
		},
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Crawling page:", r.URL)
	})

	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	err := c.Visit(url)
	if err != nil {
		fmt.Println(err.Error())
	}
}
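Copying cookies out of the browser is one option; Colly can also perform the login itself. A POST to the login form stores the resulting session cookie in the collector's cookie jar, and later visits with the same collector reuse it automatically. A rough sketch; the login URL and the form field names (user, pw) are placeholders and have to be read from the real login page:

package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	c := colly.NewCollector()

	// Placeholder login endpoint and form fields; adjust to the real form.
	err := c.Post("https://blog.dbsgw.com/admin/index.php?action=login", map[string]string{
		"user": "admin",
		"pw":   "password",
	})
	if err != nil {
		fmt.Println("login failed:", err)
		return
	}

	// The session cookie set by the login response is now in the cookie jar,
	// so protected pages can be visited with the same collector.
	c.OnHTML("#admindex_servinfo > ul", func(e *colly.HTMLElement) {
		fmt.Println(e.Text)
	})

	c.Visit("https://blog.dbsgw.com/admin/")
}

Either way, the important detail is that the login request and the protected pages go through the same collector, since that is where the cookie jar lives.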
That covers the main ways to use the Colly crawler framework in Go.