爬取当当网的图书信息之封装一个工具类
Posted 王起帆
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取当当网的图书信息之封装一个工具类相关的知识,希望对你有一定的参考价值。
把这个类名取为Tool
封装一个下载网页的方法GetHtml
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The downloaded text, or an empty string when <paramref name="url"/> is
/// null/blank or the download throws.
/// </returns>
public static string GetHtml(string url)
{
    // Nothing to fetch: produce the same "" the failure path returns,
    // without constructing a client.
    if (string.IsNullOrWhiteSpace(url))
    {
        return "";
    }

    try
    {
        // WebClient is IDisposable; release it as soon as the download finishes.
        using (WebClient wb = new WebClient())
        {
            return wb.DownloadString(url);
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any failure is reported as an empty page.
        return "";
    }
}
传入的是这个网页的URL,这个方法能帮我们把网页下载下来
封装一个匹配图书类URL的方法
/// <summary>
/// Collects the distinct book-category URLs found in <paramref name="html"/>.
/// </summary>
/// <param name="html">Page source to scan.</param>
/// <returns>
/// An <see cref="ArrayList"/> of URL strings in order of first appearance;
/// empty when <paramref name="html"/> is null/empty or nothing matches.
/// </returns>
public static ArrayList GetList(string html)
{
    ArrayList list = new ArrayList();
    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Dots are escaped so only literal '.' separators match; an unescaped '.'
    // would accept any character in those positions.
    const string categoryUrlPattern =
        @"http://category\.dangdang\.com/cp[0-9]{2}\.[0-9]{2}\.[0-9][1-9]\.00\.00\.00\.html";

    // The set gives O(1) duplicate detection; the ArrayList keeps the
    // return type and the first-seen ordering.
    HashSet<string> seen = new HashSet<string>();
    foreach (Match match in Regex.Matches(html, categoryUrlPattern))
    {
        if (seen.Add(match.Value))
        {
            list.Add(match.Value);
        }
    }

    return list;
}
这里使用了正则http://category.dangdang.com/cp[0-9]{2}.[0-9]{2}.[0-9][1-9].00.00.00.html来匹配URL地址
封装一个获取图书类名的方法
/// <summary>
/// Reads the book-class name out of the page's "keywords" meta tag.
/// </summary>
/// <remarks>
/// Sample tag: &lt;meta name="keywords" content="计算机/网络,家庭与办公室用书" /&gt;.
/// The value returned is the part of the content attribute that follows the
/// first '/' in the matched tag.
/// </remarks>
/// <param name="html">Page source to scan.</param>
/// <returns>
/// The extracted text, or an empty string when <paramref name="html"/> is
/// null/empty, the tag is not found, or the content attribute contains no '/'.
/// </returns>
public static string GetBookClassName(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    Match match = Regex.Match(html, "<meta name=\"keywords\" content=\".{1,30}\" />");
    if (!match.Success)
    {
        return "";
    }

    // Closing characters of the tag that sit between the content value and
    // the final '>'.
    const string tagTail = "\" />";

    string tag = match.Value;
    int slashAt = tag.IndexOf('/');
    int closeAt = tag.LastIndexOf('>');

    // When the content has no '/', the first slash found is the one in "/>",
    // which leaves nothing to extract.
    if (closeAt - slashAt > tagTail.Length)
    {
        return tag.Substring(slashAt + 1, closeAt - slashAt - tagTail.Length);
    }

    return "";
}
查看网页的源代码
<meta name="keywords" content="计算机/网络,家庭与办公室用书" />
图书类名就在这里 接着我们使用正则把它抓取到
接下来我们要抓取每个图书类别共有多少页
/// <summary>
/// Reads the total page count from the pager element of a category page.
/// </summary>
/// <param name="html">Page source to scan.</param>
/// <returns>
/// The number found between "共" and "页" in the pager, or 1 when
/// <paramref name="html"/> is null/empty or the pager is not found.
/// </returns>
public static int GetPages(string html)
{
    const int defaultPages = 1;
    if (string.IsNullOrEmpty(html))
    {
        return defaultPages;
    }

    // The capture group isolates the digits directly, so the matched text
    // does not have to be re-searched with IndexOf/Substring.
    Match match = Regex.Match(
        html,
        "<li class=\"page_input\"><span>共([0-9]{1,4})页 到第</span>");
    if (!match.Success)
    {
        return defaultPages;
    }

    // The group is 1-4 ASCII digits, so the parse cannot fail or overflow.
    return int.Parse(
        match.Groups[1].Value,
        System.Globalization.CultureInfo.InvariantCulture);
}
处理好BookClass接下来处理Book了
获取图书详细页面的URL
/// <summary>
/// Collects the distinct product-detail URLs found in <paramref name="html"/>.
/// </summary>
/// <remarks>
/// Sample URL: http://product.dangdang.com/22862060.html.
/// Every match, duplicates included, is also written to the console.
/// </remarks>
/// <param name="html">Page source to scan.</param>
/// <returns>
/// An <see cref="ArrayList"/> of URL strings in order of first appearance;
/// empty when <paramref name="html"/> is null/empty or nothing matches.
/// </returns>
public static ArrayList GetProduct(string html)
{
    ArrayList list = new ArrayList();
    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Dots are escaped so only literal '.' separators match.
    const string productUrlPattern = @"http://product\.dangdang\.com/[0-9]{8}\.html";

    // The set gives O(1) duplicate detection; the ArrayList keeps the
    // return type and the first-seen ordering.
    HashSet<string> seen = new HashSet<string>();
    foreach (Match match in Regex.Matches(html, productUrlPattern))
    {
        string url = match.Value;

        // NOTE(review): console trace kept from the original implementation;
        // confirm whether callers still want it.
        Console.WriteLine(url);

        if (seen.Add(url))
        {
            list.Add(url);
        }
    }

    return list;
}
封装一个方法,待爬虫获取图书详细页来抓取图书信息
以如何抓取价格信息为例
<div class="price_pc" id="pc-price"> <div class="price_d"> <p class="t" id="dd-price-text">当当价</p> <p id="dd-price"> <span class="yen">¥</span>66.40 </p> </div> <div class="price_zhe" id="dd-zhe"></div> <div class="price_m price_m_t" id="original-price-text">定价</div> <div class="price_m" id='original-price'> <span class="yen">¥</span>99.00 </div> <div class="price_vip" style="display:none" id="dd-vip"> <span></span> </div> </div> </div>
66.40是我们需要匹配出来的数据,数据特征并不是很明显,直接匹配会出现杂乱的数据,我们先抓取稍大范围的,缩小搜索范围再来寻找
// Narrow the search: find each "yen" span that is immediately followed by a
// price written as digits, a literal dot and two decimals (e.g. 66.40).
MatchCollection matches = Regex.Matches(html, " <span class=\"yen\">¥</span>[0-9]+\\.[0-9]{2}");
缩小爬虫抓取范围后,借助IndexOf来定位价格
// Take the first match and keep everything after its closing </span>:
// that tail is the price text.
if (matches.Count > 0)
{
    const string closingTag = "</span>";
    string firstHit = matches[0].ToString();
    int tagAt = firstHit.IndexOf(closingTag, 0);
    if (tagAt > 0)
    {
        price = firstHit.Substring(tagAt + closingTag.Length);
    }
}
嘿嘿 价格信息就这样抓取到了,其他的不详细介绍
/// <summary>
/// Extracts the fields of one book from the source of its detail page.
/// </summary>
/// <param name="html">Source of the book detail page; null is treated as empty.</param>
/// <returns>
/// A dictionary with six entries: 1 = book name, 2 = price, 3 = author,
/// 4 = publisher, 5 = cover image URL, 6 = description. A field that cannot
/// be found is an empty string, except the price, which defaults to "0".
/// </returns>
public static Dictionary<int, string> analysis(string html)
{
    if (html == null)
    {
        html = "";
    }

    // Price: the first "yen" span immediately followed by digits, a literal
    // dot and two decimals; the capture group is the price text itself.
    string price = "0";
    Match priceMatch = Regex.Match(html, @" <span class=""yen"">¥</span>([0-9]+\.[0-9]{2})");
    if (priceMatch.Success)
    {
        price = priceMatch.Groups[1].Value;
    }

    // Book name: the text after "《" inside <title>, ended by "》" or, when
    // that is absent, by the fixed title suffix.
    string bookName = "";
    Match titleMatch = Regex.Match(html, "<title>.*</title>");
    if (titleMatch.Success)
    {
        string title = titleMatch.Value;
        int open = title.IndexOf("《", StringComparison.Ordinal);
        if (open > 0)
        {
            int close = title.IndexOf("》", open, StringComparison.Ordinal);
            if (close <= open)
            {
                close = title.IndexOf("【简介_书评_在线阅读】 - 当当图书", open, StringComparison.Ordinal);
            }

            if (close > open)
            {
                bookName = title.Substring(open + 1, close - open - 1);
            }
        }
    }

    // Author and publisher are the link texts that follow their dd_name markers,
    // e.g. ... target="_blank" dd_name="作者">Marty</a>
    string author = ExtractBetween(html, "target=\"_blank\" dd_name=\"作者\">", "</a>");
    string publisher = ExtractBetween(html, "target=\"_blank\" dd_name=\"出版社\">", "</a>");

    // Cover image, e.g. http://img3x6.ddimg.cn/88/36/23845426-1_u_5.jpg
    // (dots escaped so only literal '.' separators match).
    string imgUrl = "";
    Match imageMatch = Regex.Match(
        html,
        @"http://img3x[0-9]\.ddimg\.cn/[0-9]{2}/[0-9]{2}/[0-9]{8}-[0-9]_u_[0-9]\.jpg");
    if (imageMatch.Success)
    {
        imgUrl = imageMatch.Value;
    }

    // Description: the content attribute of the "description" meta tag.
    string content = ExtractBetween(html, "<meta name=\"description\" content=\"", "\">");

    Dictionary<int, string> dictionary = new Dictionary<int, string>();
    dictionary.Add(1, bookName);
    dictionary.Add(2, price);
    dictionary.Add(3, author);
    dictionary.Add(4, publisher);
    dictionary.Add(5, imgUrl);
    dictionary.Add(6, content);
    return dictionary;
}

/// <summary>
/// Returns the text between the first <paramref name="startMarker"/> and the
/// next <paramref name="endMarker"/> after it, or "" when either is missing.
/// </summary>
private static string ExtractBetween(string source, string startMarker, string endMarker)
{
    int markerAt = source.IndexOf(startMarker, StringComparison.Ordinal);
    if (markerAt < 0)
    {
        return "";
    }

    int valueStart = markerAt + startMarker.Length;
    int valueEnd = source.IndexOf(endMarker, valueStart, StringComparison.Ordinal);
    if (valueEnd < 0)
    {
        return "";
    }

    return source.Substring(valueStart, valueEnd - valueStart);
}
Tool类完成
以上是关于爬取当当网的图书信息之封装一个工具类的主要内容,如果未能解决你的问题,请参考以下文章
用python的xpath定位textarea爬取不下来是啥原因,一直是空,比如当当网图书的目录标签就是textarea?