企业🤖AI Agent构建引擎,智能编排和调试,一键部署,支持私有化部署方案 广告
[TOC]

参考:https://blog.csdn.net/weixin_43968923/article/details/108994707

## 概述

gocolly 是用 Go 语言开发的爬虫包。本文通过 gocolly 携带浏览器中的 cookie 来模拟网站的登录。目标已经确立,开始行动。

### 下载

```sh
go get -u github.com/gocolly/colly
```

## cookie登录

登录网站:https://www.bilibili.com/

获取cookie:

* 使用 Google 浏览器访问 `bilibili`
* 按 F12
* 查看 `Network` 选项
* 点击 `Doc`
* 查看 cookie 信息

## colly爬虫代码流程

初始化一个 `Collector` 收集器:

```go
c := colly.NewCollector(
	colly.AllowedDomains("www.bilibili.com"),
	colly.AllowURLRevisit(),
	colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"))
```

爬取网站的规则设置:

```go
err := c.Limit(&colly.LimitRule{
	// Filter domains affected by this rule. The glob is matched against
	// the request host, so it must not contain a path part.
	DomainGlob: "*bilibili.com",
	// Set a delay between requests to these domains.
	Delay: 1 * time.Second,
	// Add an additional random delay.
	RandomDelay: 1 * time.Second,
	// Maximum number of parallel requests to matching domains.
	Parallelism: 5,
})
```

开始爬取:

```go
err = c.Visit(url)
c.Wait()
```

代码示例:

```go
package main

import (
	"fmt"
	"net/http"
	"os"
	"strings"
	"time"

	"github.com/gocolly/colly"
)

/*
Collector callbacks:

	OnRequest    - called before a request is executed
	OnResponse   - called after a response has been received
	OnHTML       - called for elements matching the given selector
	OnXML        - called for nodes matching the given selector
	OnError      - error callback
	OnScraped    - called after scraping has finished and all other work is done
	OnHTMLDetach - removes an OnHTML listener; takes the selector string
	OnXMLDetach  - removes an OnXML listener; takes the selector string
*/

// setCookieRaw parses a raw Cookie header value such as "a=1; b=2" into a
// slice of cookies.
//
// Pairs are separated by ';' and surrounding whitespace is ignored. Only the
// first '=' of a pair separates name from value, so values may themselves
// contain '='. A pair without '=' yields a cookie with an empty value.
// Segments with an empty name (including an entirely empty input) are
// skipped, so the result never contains a nameless cookie.
func setCookieRaw(cookieRaw string) []*http.Cookie {
	var cookies []*http.Cookie
	for _, item := range strings.Split(cookieRaw, ";") {
		item = strings.TrimSpace(item)
		if item == "" {
			continue
		}
		// SplitN with n=2 keeps any further '=' inside the value.
		keyValue := strings.SplitN(item, "=", 2)
		name := strings.TrimSpace(keyValue[0])
		if name == "" {
			continue
		}
		value := ""
		if len(keyValue) == 2 {
			value = keyValue[1]
		}
		cookies = append(cookies, &http.Cookie{Name: name, Value: value})
	}
	return cookies
}

// main visits the bilibili home page with the cookies of a logged-in browser
// session and prints the text of every link that contains a <span>, plus the
// href of the "动态" link that opens in a new tab.
func main() {
	const startURL = "https://www.bilibili.com/"

	c := colly.NewCollector(
		colly.AllowedDomains("www.bilibili.com"),
		colly.AllowURLRevisit(),
		colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"))

	err := c.Limit(&colly.LimitRule{
		// Filter domains affected by this rule. The glob is matched against
		// the request host, so it must not contain a path part.
		DomainGlob: "*bilibili.com",
		// Set a delay between requests to these domains.
		Delay: 1 * time.Second,
		// Add an additional random delay.
		RandomDelay: 1 * time.Second,
		// Maximum number of parallel requests to matching domains.
		Parallelism: 5,
	})
	if err != nil {
		// Not fatal: the crawl still works, just without rate limiting.
		fmt.Println("fad:", err)
	}

	// Extra request headers. Host is intentionally not set here: it is
	// derived from the request URL, and a full URL is not a valid Host value.
	// User-Agent is already configured through colly.UserAgent above.
	header := map[string]string{
		"Accept":     "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
		"Connection": "keep-alive",
	}

	// Replace "xxx" with the Cookie header value copied from the browser.
	// The cookies are stored in the collector's cookie jar once, before the
	// first request, instead of on every request.
	cookie := "xxx"
	if err := c.SetCookies(startURL, setCookieRaw(cookie)); err != nil {
		fmt.Println("fad:", err)
	}

	// Before each request: apply the extra headers and log the URL.
	c.OnRequest(func(r *colly.Request) {
		for key, value := range header {
			// Set (not Add) so a header is never sent twice.
			r.Headers.Set(key, value)
		}
		fmt.Println("url: ", r.URL.String())
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		name := e.ChildText("span")
		if len(name) == 0 {
			return
		}
		fmt.Println("name: ", name)
		if e.Attr("target") == "_blank" && name == "动态" {
			fmt.Println("link: ", e.Attr("href"))
		}
	})

	if err := c.Visit(startURL); err != nil {
		// fmt.Errorf only builds an error value; print the message instead
		// so the failure is actually visible before exiting.
		fmt.Fprintf(os.Stderr, "fffffff %s\n", err.Error())
		os.Exit(-1)
	}
	c.Wait()
	fmt.Println("程序结束")
}
```