基于HttpClient实现网络爬虫~以百度新闻为例

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了基于HttpClient实现网络爬虫~以百度新闻为例相关的知识,希望对你有一定的参考价值。

转载请注明出处:http://blog.csdn.net/xiaojimanman/article/details/40891791


      基于HttpClient4.5实现网络爬虫请访问这里:http://blog.csdn.net/xiaojimanman/article/details/53178307

      在以前的工作中,实现过简单的网络爬虫,但没有系统地介绍过,这篇博客就系统地介绍一下如何使用Java的HttpClient实现网络爬虫。

      关于网络爬虫的一些理论知识、实现思想以及策略问题,可以参考百度百科“网络爬虫”,那里已经介绍得十分详细,这里不再啰嗦,下面主要介绍如何去实现。


http请求:
      代码开始之前,还是首先介绍一下如何通过浏览器获取http请求信息,这一步是分析网站资源的第一步。在浏览器界面右键有“审查元素”这一功能(如果没找到,按F12也可以),谷歌浏览器效果如下:

技术分享

    点击“审查元素”之后会出现如下界面:

技术分享

    其中的Network栏目是做爬虫应该重点关注的,打开会看到当前网页所有的http请求信息,如下图:技术分享

    单击每一条信息,可以看到http请求的详细信息,如下图所示:

技术分享

    通过程序伪装成浏览器请求的时候,就需要多关注Request Headers里面的信息,还有一些需要登录的网站也需要关注这些。Response里面的信息就是服务器返回的内容,这里只对文本信息做处理,对图片、音频、视频等信息不做介绍。

    Response里面就包含着我们爬虫想获取的信息内容。如果里面的格式不好看的话,可以在浏览器中输入该http请求的url地址,然后通过右键-->查看网页源码的形式查看相关信息。

通过分析网页源码中的字符串,总结出统一的规则,提取相应的文本信息。


代码实现:

    CrawlBase类,模拟http请求的基类

 /**  
 *@Description: 获取网页信息基类
 */ 
package com.lulei.crawl;  

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;

import com.lulei.util.CharsetUtil;


/**
 * Base class that simulates an HTTP request and keeps the fetched page source
 * and response headers for subclasses to parse.
 *
 * <p>All instances share one static {@link HttpClient}; the timeout, retry and
 * charset settings are therefore process-wide, not per instance.
 *
 * @author lulei
 */
public abstract class CrawlBase {
	private static final Logger log = Logger.getLogger(CrawlBase.class);
	
	// Source text of the most recently fetched page.
	private String pageSourceCode = "";
	// Response headers of the most recently fetched page.
	private Header[] responseHeaders = null;
	// Connection timeout handed to the shared client.
	private static int connectTimeout = 3500;
	// Socket read timeout handed to the shared client.
	private static int readTimeout = 3500;
	// Default maximum number of attempts per request.
	private static int maxConnectTimes = 3;
	// Charset used for the first decoding pass of the response body; the body is
	// re-decoded afterwards if charset detection reports something different.
	private static String charsetName = "iso-8859-1";
	private static HttpClient httpClient = new HttpClient();
	
	static {
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
		httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
	}
	
	/**
	 * Fetches a page using the given HTTP method name.
	 *
	 * @param urlStr         address to request
	 * @param charsetName    fallback charset used when detection fails
	 * @param method         "post" (any letter case) selects POST; anything else selects GET
	 * @param params         request headers for GET, form parameters for POST; may be null
	 * @return true when the page was fetched successfully
	 * @throws HttpException declared for callers; failures are normally retried internally
	 * @throws IOException   declared for callers; failures are normally retried internally
	 */
	public boolean readPage(String urlStr, String charsetName, String method, HashMap<String, String> params) throws HttpException, IOException {
		if ("post".equalsIgnoreCase(method)) {
			return readPageByPost(urlStr, charsetName, params);
		}
		return readPageByGet(urlStr, charsetName, params);
	}
	
	/**
	 * Fetches a page with GET.
	 *
	 * @param urlStr      address to request
	 * @param charsetName fallback charset used when detection fails
	 * @param params      request headers to send; may be null
	 * @return true when the page was fetched successfully
	 * @throws HttpException declared for callers
	 * @throws IOException   declared for callers
	 */
	public boolean readPageByGet(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
		GetMethod getMethod = createGetMethod(urlStr, params);
		return readPage(getMethod, charsetName, urlStr);
	}
	
	/**
	 * Fetches a page with POST.
	 *
	 * @param urlStr      address to request
	 * @param charsetName fallback charset used when detection fails
	 * @param params      form parameters to send; may be null
	 * @return true when the page was fetched successfully
	 * @throws HttpException declared for callers
	 * @throws IOException   declared for callers
	 */
	public boolean readPageByPost(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException{
		PostMethod postMethod = createPostMethod(urlStr, params);
		return readPage(postMethod, charsetName, urlStr);
	}
	
	/**
	 * Executes the request, retrying up to {@code maxConnectTimes} times, and stores
	 * the response headers and the decoded page source on success.
	 *
	 * @param method         prepared request
	 * @param defaultCharset charset to fall back to when detection fails
	 * @param urlStr         address being requested (used for logging only)
	 * @return true when a 200 response was read completely, false when every attempt failed
	 * @throws HttpException declared for callers
	 * @throws IOException   declared for callers
	 */
	private boolean readPage(HttpMethod method, String defaultCharset, String urlStr) throws HttpException, IOException{
		// Snapshot the static settings so one call uses consistent values throughout.
		final String rawCharset = charsetName;
		final int maxAttempts = maxConnectTimes;
		for (int attempt = 1; attempt <= maxAttempts; attempt++) {
			try {
				// Execute exactly once per attempt; the status is reused for logging
				// instead of issuing a second request.
				int status = httpClient.executeMethod(method);
				if (status != HttpStatus.SC_OK) {
					log.error("can not connect " + urlStr + "\t" + attempt + "\t" + status);
					continue;
				}
				responseHeaders = method.getResponseHeaders();
				String rawSource = readBody(method.getResponseBodyAsStream(), rawCharset);
				pageSourceCode = decode(rawSource, rawCharset, defaultCharset);
				return true;
			} catch (Exception e) {
				log.error(urlStr + " -- can‘t connect  " + attempt, e);
			} finally {
				// Always hand the connection back to the connection manager.
				method.releaseConnection();
			}
		}
		return false;
	}
	
	/**
	 * Reads the whole response body as text, normalising line ends to "\n".
	 *
	 * @param body       response stream; null is treated as an empty body
	 * @param rawCharset charset used for this first decoding pass
	 * @return the body text, each line followed by "\n"
	 * @throws IOException when reading fails or the charset is unsupported
	 */
	private static String readBody(InputStream body, String rawCharset) throws IOException {
		if (body == null) {
			return "";
		}
		BufferedReader reader = new BufferedReader(new InputStreamReader(body, rawCharset));
		try {
			StringBuilder text = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				text.append(line).append("\n");
			}
			return text.toString();
		} finally {
			reader.close();
		}
	}
	
	/**
	 * Detects the real charset of the page and re-decodes the text when it differs
	 * from the charset used for the first pass.
	 *
	 * @param rawSource      text decoded with {@code rawCharset}
	 * @param rawCharset     charset used for the first decoding pass
	 * @param defaultCharset charset to fall back to when detection fails
	 * @return the page text in its detected charset
	 * @throws IOException when one of the charsets is unsupported
	 */
	private static String decode(String rawSource, String rawCharset, String defaultCharset) throws IOException {
		byte[] rawBytes = rawSource.getBytes(rawCharset);
		String charset = CharsetUtil.getStreamCharset(new ByteArrayInputStream(rawBytes), defaultCharset);
		// Special case added for the IP-location lookup pages: treat Big5 as gbk.
		if ("Big5".equals(charset)) {
			charset = "gbk";
		}
		// No usable detection result, or nothing to convert: keep the first-pass text.
		if (charset == null || rawCharset.equalsIgnoreCase(charset)) {
			return rawSource;
		}
		return new String(rawBytes, charset);
	}
	
	/**
	 * Builds a GET request; every entry of {@code params} is sent as a request header.
	 *
	 * @param urlStr address to request
	 * @param params request headers; may be null
	 * @return the prepared GetMethod
	 */
	private GetMethod createGetMethod(String urlStr, HashMap<String, String> params){
		GetMethod getMethod = new GetMethod(urlStr);
		if (params == null){
			return getMethod;
		}
		for (Map.Entry<String, String> entry : params.entrySet()) {
			getMethod.setRequestHeader(entry.getKey(), entry.getValue());
		}
		return getMethod;
	}
	
	/**
	 * Builds a POST request; every entry of {@code params} is sent as a form parameter.
	 *
	 * @param urlStr address to request
	 * @param params form parameters; may be null
	 * @return the prepared PostMethod
	 */
	private PostMethod createPostMethod(String urlStr, HashMap<String, String> params){
		PostMethod postMethod = new PostMethod(urlStr);
		if (params == null){
			return postMethod;
		}
		for (Map.Entry<String, String> entry : params.entrySet()) {
			postMethod.setParameter(entry.getKey(), entry.getValue());
		}
		return postMethod;
	}
	
	/**
	 * Fetches a page with GET without setting any request headers.
	 *
	 * @param urlStr      address to request
	 * @param charsetName fallback charset used when detection fails
	 * @return true when the page was fetched successfully
	 * @throws IOException declared for callers
	 */
	public boolean readPageByGet(String urlStr, String charsetName) throws IOException{
		return this.readPageByGet(urlStr, charsetName, null);
	}
	
	/**
	 * @return source text of the most recently fetched page ("" before any successful fetch)
	 */
	public String getPageSourceCode(){
		return pageSourceCode;
	}
	
	/**
	 * @return response headers of the most recently fetched page, or null before any successful fetch
	 */
	public Header[] getHeader(){
		return responseHeaders;
	}
	
	/**
	 * Sets the connection timeout on the shared client (affects every instance).
	 *
	 * @param timeout connection timeout
	 */
	public void setConnectTimeout(int timeout){
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
	}
	
	/**
	 * Sets the socket read timeout on the shared client (affects every instance).
	 *
	 * @param timeout read timeout
	 */
	public void setReadTimeout(int timeout){
		httpClient.getHttpConnectionManager().getParams().setSoTimeout(timeout);
	}
	
	/**
	 * Sets the maximum number of attempts made when a request fails.
	 *
	 * @param maxConnectTimes maximum number of attempts
	 */
	public static void setMaxConnectTimes(int maxConnectTimes) {
		CrawlBase.maxConnectTimes = maxConnectTimes;
	}

	/**
	 * Sets both the connection timeout and the read timeout.
	 *
	 * @param connectTimeout connection timeout
	 * @param readTimeout    read timeout
	 */
	public void setTimeout(int connectTimeout, int readTimeout){
		setConnectTimeout(connectTimeout);
		setReadTimeout(readTimeout);
	}

	/**
	 * Sets the charset used for the first decoding pass of response bodies.
	 *
	 * @param charsetName charset name
	 */
	public static void setCharsetName(String charsetName) {
		CrawlBase.charsetName = charsetName;
	}
}


    CrawlListPageBase类是CrawlBase的子类,实现了从页面中获取链接的URL信息基类

 /**  
  *@Description: 获取页面链接地址信息基类  
 */ 
package com.lulei.crawl;  

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.lulei.util.DoRegex;


/**
 * Base class for list pages: fetches a page on construction and extracts the
 * link addresses selected by a subclass-supplied regular expression.
 */
public abstract class CrawlListPageBase extends CrawlBase {
	// Address of the fetched list page; relative links are resolved against it.
	private String pageurl;
	
	/**
	 * Fetches the list page with a plain GET request.
	 *
	 * @param urlStr      address of the list page
	 * @param charsetName fallback charset for the page
	 * @throws IOException declared by the underlying fetch
	 */
	public CrawlListPageBase(String urlStr, String charsetName) throws IOException{
		readPageByGet(urlStr, charsetName);
		this.pageurl = urlStr;
	}
	
	/**
	 * Fetches the list page with the given method and parameters.
	 *
	 * @param urlStr      address of the list page
	 * @param charsetName fallback charset for the page
	 * @param method      HTTP method name
	 * @param params      parameters handed to the underlying fetch
	 * @throws IOException declared by the underlying fetch
	 */
	public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap<String, String> params) throws IOException{
		readPage(urlStr, charsetName, method, params);
		this.pageurl = urlStr;
	}
	
	/**
	 * Returns the link addresses on the page that match the subclass's regular expression.
	 *
	 * @return list of matching link addresses
	 */
	public List<String> getPageUrls(){
		String source = getPageSourceCode();
		String linkRegex = getUrlRegexString();
		int groupIndex = getUrlRegexStringNum();
		return DoRegex.getArrayList(source, linkRegex, pageurl, groupIndex);
	}
	
	/**
	 * @return regular expression that matches the wanted links on the page
	 */
	public abstract String getUrlRegexString();
	
	/**
	 * @return index of the capture group in the regular expression that holds the link address
	 */
	public abstract int getUrlRegexStringNum();
}

    DoRegex类,封装了一些基于正则表达式的字符串匹配查找方法
 /**  
 * @Description: 正则处理工具   
 */ 
package com.lulei.util;  

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
  
public class DoRegex {
	
	private static String rootUrlRegex = "(http://.*?/)";
	private static String currentUrlRegex = "(http://.*/)";
	private static String ChRegex = "([\u4e00-\u9fa5]+)";

	/**
	 * @param dealStr
	 * @param regexStr
	 * @param splitStr
	 * @param n
	 * @return String
	 * @Author: lulei  
	 * @Description: 正则匹配结果。每条记录用splitStr切割
	 */
	public static String getString(String dealStr, String regexStr, String splitStr, int n){
		String reStr = "";
		if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
			return reStr;
		}
		splitStr = (splitStr == null) ?

"" : splitStr; Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher matcher = pattern.matcher(dealStr); StringBuffer stringBuffer = new StringBuffer(); while (matcher.find()) { stringBuffer.append(matcher.group(n).trim()); stringBuffer.append(splitStr); } reStr = stringBuffer.toString(); if (splitStr != "" && reStr.endsWith(splitStr)){ reStr = reStr.substring(0, reStr.length() - splitStr.length()); } return reStr; } /** * @param dealStr * @param regexStr * @param n * @return String * @Author: lulei * @Description: 正则匹配结果,将所有匹配记录组装成字符串 */ public static String getString(String dealStr, String regexStr, int n){ return getString(dealStr, regexStr, null, n); } /** * @param dealStr * @param regexStr * @param n * @return String * @Author: lulei * @Description: 正则匹配第一条结果 */ public static String getFirstString(String dealStr, String regexStr, int n){ if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){ return ""; } Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher matcher = pattern.matcher(dealStr); while (matcher.find()) { return matcher.group(n).trim(); } return ""; } /** * @param dealStr * @param regexStr * @param n * @return ArrayList<String> * @Author: lulei * @Description: 正则匹配结果。将匹配结果组装成数组 */ public static List<String> getList(String dealStr, String regexStr, int n){ List<String> reArrayList = new ArrayList<String>(); if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){ return reArrayList; } Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher matcher = pattern.matcher(dealStr); while (matcher.find()) { reArrayList.add(matcher.group(n).trim()); } return reArrayList; } /** * @param url * @param currentUrl * @return String * @Author: lulei * @Description: 组装网址,网页的url */ private static String getHttpUrl(String url, String currentUrl){ try { url = encodeUrlCh(url); } catch (UnsupportedEncodingException 
e) { // TODO Auto-generated catch block e.printStackTrace(); } if (url.indexOf("http") == 0){ return url; } if (url.indexOf("/") == 0){ return getFirstString(currentUrl, rootUrlRegex, 1) + url.substring(1); } return getFirstString(currentUrl, currentUrlRegex, 1) + url; } /** * @param dealStr * @param regexStr * @param currentUrl * @param n * @return ArrayList<String> * @Author: lulei * @Description: 获取和正则匹配的绝对链接地址 */ public static List<String> getArrayList(String dealStr, String regexStr, String currentUrl, int n){ List<String> reArrayList = new ArrayList<String>(); if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){ return reArrayList; } Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher matcher = pattern.matcher(dealStr); while (matcher.find()) { reArrayList.add(getHttpUrl(matcher.group(n).trim(), currentUrl)); } return reArrayList; } /** * @param url * @return * @throws UnsupportedEncodingException * @Author: lulei * @Description: 将连接地址中的中文进行编码处理 */ public static String encodeUrlCh (String url) throws UnsupportedEncodingException { while (true) { String s = getFirstString(url, ChRegex, 1); if ("".equals(s)){ return url; } url = url.replaceAll(s, URLEncoder.encode(s, "utf-8")); } } /** * @param dealStr * @param regexStr * @param array 正则位置数组 * @return * @Author:lulei * @Description: 获取所有 */ public static List<String[]> getListArray(String dealStr, String regexStr, int[] array) { List<String[]> reArrayList = new ArrayList<String[]>(); if (dealStr == null || regexStr == null || array == null) { return reArrayList; } for (int i = 0; i < array.length; i++) { if (array[i] < 1) { return reArrayList; } } Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher matcher = pattern.matcher(dealStr); while (matcher.find()) { String[] ss = new String[array.length]; for (int i = 0; i < array.length; i++) { ss[i] = matcher.group(array[i]).trim(); } reArrayList.add(ss); } 
return reArrayList; } /** * @param dealStr * @param regexStr * @param array * @return * @Author:lulei * @Description: 获取所有 */ public static List<String> getStringArray(String dealStr, String regexStr, int[] array) { List<String> reStringList = new ArrayList<String>(); if (dealStr == null || regexStr == null || array == null) { return reStringList; } for (int i = 0; i < array.length; i++) { if (array[i] < 1) { return reStringList; } } Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher matcher = pattern.matcher(dealStr); while (matcher.find()) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < array.length; i++) { sb.append(matcher.group(array[i]).trim()); } reStringList.add(sb.toString()); } return reStringList; } /** * @param dealStr * @param regexStr * @param array 正则位置数组 * @return * @Author:lulei * @Description: 获取第一个 */ public static String[] getFirstArray(String dealStr, String regexStr, int[] array) { if (dealStr == null || regexStr == null || array == null) { return null; } for (int i = 0; i < array.length; i++) { if (array[i] < 1) { return null; } } Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher matcher = pattern.matcher(dealStr); while (matcher.find()) { String[] ss = new String[array.length]; for (int i = 0; i < array.length; i++) { ss[i] = matcher.group(array[i]).trim(); } return ss; } return null; } }


    CharsetUtil类,编码方式检测类

 /**  
 *@Description:  编码方式检測类  
 */ 
package com.lulei.util;  

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;
  
/**
 * Charset detection helper built on the third-party cpdetector library.
 */
public class CharsetUtil {
	private static final CodepageDetectorProxy detector = buildDetector();
	
	/**
	 * Creates the shared detector proxy and registers the detectors in the order
	 * ParsingDetector, ASCIIDetector, UnicodeDetector, JChardetFacade.
	 */
	private static CodepageDetectorProxy buildDetector() {
		CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();
		proxy.add(new ParsingDetector(false));
		proxy.add(ASCIIDetector.getInstance());
		proxy.add(UnicodeDetector.getInstance());
		proxy.add(JChardetFacade.getInstance());
		return proxy;
	}

	/**
	 * Detects the charset of the resource behind a URL.
	 *
	 * @param url            resource to inspect; null yields {@code defaultCharset}
	 * @param defaultCharset value returned when detection fails or finds nothing
	 * @return name of the detected charset, or {@code defaultCharset}
	 */
	public static String getStreamCharset (URL url, String defaultCharset) {
		if (url == null) {
			return defaultCharset;
		}
		Charset detected = null;
		try {
			// Delegate detection to the third-party library.
			detected = detector.detectCodepage(url);
		} catch (Exception e1) {
			e1.printStackTrace();
		}
		return detected != null ? detected.name() : defaultCharset;
	}
	
	/**
	 * Detects the charset of a stream's content.
	 *
	 * @param inputStream    stream to inspect; null yields {@code defaultCharset}
	 * @param defaultCharset value returned when detection fails or finds nothing
	 * @return name of the detected charset, or {@code defaultCharset}
	 */
	public static String getStreamCharset (InputStream inputStream, String defaultCharset) {
		if (inputStream == null) {
			return defaultCharset;
		}
		// Length handed to the detector: whatever the stream reports as available,
		// or 200 when that query fails.
		int probeLength;
		try {
			probeLength = inputStream.available();
		} catch (IOException e) {
			e.printStackTrace();
			probeLength = 200;
		}
		Charset detected = null;
		try {
			// Delegate detection to the third-party library.
			detected = detector.detectCodepage(inputStream, probeLength);
		} catch (Exception e1) {
			e1.printStackTrace();
		}
		return detected != null ? detected.name() : defaultCharset;
	}
}
    上面四个类,就实现了网络文本资源信息抓取的基本架构,下面就通过一个实际的例子介绍如何使用上述类实现网络文本资源信息的获取。


百度新闻案例:

    1)找到百度新闻更新列表页,如http://news.baidu.com/n?cmd=4&class=civilnews&pn=1&from=tab 界面如下图所示:

技术分享


    文章URL链接地址如下图所示:

技术分享


    通过对源文件的分析,编写BaiduNewList类,实现百度新闻列表页信息的抓取,代码如下:

 /**  
 *@Description:   百度新闻滚动列表页,能够获取当前页面上的链接
 */ 
package com.lulei.crawl.news;  

import java.io.IOException;
import java.util.HashMap;

import com.lulei.crawl.CrawlListPageBase;
  
public class BaiduNewList extends CrawlListPageBase{
	private static HashMap<String, String> params;
	
	/**
	 * 加入相关头信息,对请求进行伪装
	 */
	static {
		params = new HashMap<String, String>();
		params.put("Referer", "http://www.baidu.com");
		params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
	}

	public BaiduNewList(String urlStr) throws IOException {
		super(urlStr, "utf-8", "get", params);  
	}

	@Override
	public String getUrlRegexString() {
		// TODO Auto-generated method stub  
		//新闻列表页中文章链接地址的正則表達式
		return "? <a href=\"(.*?

)\""; } @Override public int getUrlRegexStringNum() { // TODO Auto-generated method stub //链接地址在正則表達式中的位置 return 1; } /** * @param args * @throws IOException * @Author:lulei * @Description: 測试用例 */ public static void main(String[] args) throws IOException { // TODO Auto-generated method stub BaiduNewList baidu = new BaiduNewList("http://news.baidu.com/n?

cmd=4&class=sportnews&pn=1&from=tab"); for (String s : baidu.getPageUrls()) { System.out.println(s); } } }


    2)通过第一步获取的URL,得到新闻所在的内容页面URL。由于百度新闻列表页面上的新闻来自不同的站点,所以很难找到一个通用的结构。大多数新闻类网站的内容都是放在p标签内,所以就采用了如下的方式获取新闻的内容,如下图:

技术分享


    News类具体实现如下所示:

 /**  
 *@Description:   新闻类站点新闻内容 
 */ 
package com.lulei.crawl.news;  

import java.io.IOException;
import java.util.HashMap;

import org.apache.commons.httpclient.HttpException;

import com.lulei.crawl.CrawlBase;
import com.lulei.util.DoRegex;
  
/**
 * Content of a single article page on a news site: fetches the page on
 * construction and extracts its title and (truncated) body text.
 */
public class News extends CrawlBase{
	private String url;
	private String content;
	private String title;
	private String type;
	
	// A <p> element; the word boundary keeps <pre>, <param> and the like from matching.
	private static String contentRegex = "<p\\b[^>]*>(.*?)</p>";
	private static String titleRegex = "<title>(.*?)</title>";
	// Maximum number of characters kept from the body text.
	private static int maxLength = 300;
	
	private static HashMap<String, String> params;
	/**
	 * Request headers that make the request look like it comes from a browser.
	 */
	static {
		params = new HashMap<String, String>();
		params.put("Referer", "http://www.baidu.com");
		params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
	}
	
	/**
	 * Treats the text inside p tags as the article body, strips any markup left
	 * inside it, and keeps only the first {@code maxLength} characters when the
	 * text is longer than that.
	 */
	private void setContent() {
		String text = DoRegex.getString(getPageSourceCode(), contentRegex, 1);
		// (?is): match tag names regardless of case and let '.' span line breaks.
		text = text.replaceAll("\n", "")
				.replaceAll("(?is)<script.*?/script>", "")
				.replaceAll("(?is)<style.*?/style>", "")
				.replaceAll("(?s)<.*?>", "");
		this.content = text.length() > maxLength ? text.substring(0, maxLength) : text;
	}
	
	/**
	 * Treats the text of the first title tag as the article title.
	 */
	private void setTitle() {
		// Only the first <title> is the document title; later ones (e.g. inside
		// inline SVG) must not be appended to it.
		this.title = DoRegex.getFirstString(getPageSourceCode(), titleRegex, 1);
	}
	
	/**
	 * Fetches the article page and extracts its body text and title.
	 *
	 * @param url address of the article page
	 * @throws HttpException declared by the underlying fetch
	 * @throws IOException   declared by the underlying fetch
	 */
	public News(String url) throws HttpException, IOException {
		this.url = url;
		readPageByGet(url, "utf-8", params);
		setContent();
		setTitle();
	}

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public String getContent() {
		return content;
	}

	public String getTitle() {
		return title;
	}

	public String getType() {
		return type;
	}

	public void setType(String type) {
		this.type = type;
	}

	/**
	 * Sets the maximum number of body characters kept by instances created afterwards.
	 *
	 * @param maxLength maximum body length in characters
	 */
	public static void setMaxLength(int maxLength) {
		News.maxLength = maxLength;
	}

	/**
	 * Manual test: prints the body text and title of one article.
	 *
	 * @param args unused
	 * @throws HttpException declared by the underlying fetch
	 * @throws IOException   declared by the underlying fetch
	 */
	public static void main(String[] args) throws HttpException, IOException {
		News news = new News("http://we.sportscn.com/viewnews-1634777.html");
		System.out.println(news.getContent());
		System.out.println(news.getTitle());
	}

}

    3)编写抓取的入口,这里为了简单,只做了两层的分析,所以新闻更新列表页的URL就直接写在程序中,如下图所示:

技术分享


    执行一次采集任务如下图所示:

技术分享


    在main函数里面只需要一次性或周期性地去执行run函数即可,具体代码如下:

 /**  
 *@Description:     
 */ 
package com.lulei.knn.data;  

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.lulei.crawl.news.BaiduNewList;
import com.lulei.crawl.news.News;
import com.lulei.knn.index.KnnIndex;
import com.lulei.knn.index.KnnSearch;
import com.lulei.util.ParseMD5;
  
public class CrawlNews {
	private static List<Info> infos;
	private static KnnIndex knnIndex = new KnnIndex();
	private static KnnSearch knnSearch = new KnnSearch();
	private static HashMap<String, Integer> result;
	
	static {
		infos = new ArrayList<Info>();
		infos.add(new Info("http://news.baidu.com/n?

cmd=4&class=sportnews&pn=1&from=tab", "体育类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=2&from=tab", "体育类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=3&from=tab", "体育类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=1&sub=0", "军事类")); infos.add(new Info("http://news.baidu.com/n?

cmd=4&class=mil&pn=2&sub=0", "军事类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=3&sub=0", "军事类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=1&sub=0", "財经类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=2&sub=0", "財经类")); infos.add(new Info("http://news.baidu.com/n?

cmd=4&class=finannews&pn=3&sub=0", "財经类")); infos.add(new Info("http://news.baidu.com/n?

cmd=4&class=internet&pn=1&from=tab", "互联网")); infos.add(new Info("http://news.baidu.com/n?

cmd=4&class=housenews&pn=1&sub=0", "房产类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=2&sub=0", "房产类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=3&sub=0", "房产类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=1&sub=0", "游戏类")); infos.add(new Info("http://news.baidu.com/n?

cmd=4&class=gamenews&pn=2&sub=0", "游戏类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=3&sub=0", "游戏类")); } /** *@Description: 抓取网址信息 *@Author:lulei */ static class Info{ String url; String type; Info(String url, String type) { this.url = url; this.type = type; } } /** * @param info * @Author:lulei * @Description: 抓取一个列表页面下的新闻信息 */ private void crawl(Info info) { if (info == null) { return; } try { BaiduNewList baiduNewList = new BaiduNewList(info.url); List<String> urls = baiduNewList.getPageUrls(); for (String url : urls) { News news = new News(url); NewsBean newBean = new NewsBean(); newBean.setId(ParseMD5.parseStrToMd5L32(url)); newBean.setType(info.type); newBean.setUrl(url); newBean.setTitle(news.getTitle()); newBean.setContent(news.getContent()); //保存到索引文件里 knnIndex.add(newBean); //knn验证 if (news.getContent() == null || "".equals(news.getContent())) { result.put("E", 1+result.get("E")); continue; } if (info.type.equals(knnSearch.getType(news.getContent()))) { result.put("R", 1+result.get("R")); } else { result.put("W", 1+result.get("W")); } } } catch (Exception e) { e.printStackTrace(); } } /** * @Author:lulei * @Description: 启动入口 */ public void run() { result = new HashMap<String, Integer>(); result.put("R", 0); result.put("W", 0); result.put("E", 0); for (Info info : infos) { System.out.println(info.url + "------start"); crawl(info); System.out.println(info.url + "------end"); } try { knnIndex.commit(); System.out.println("R = " + result.get("R")); System.out.println("W = " + result.get("W")); System.out.println("E = " + result.get("E")); System.out.println("准确度:" + (result.get("R") * 1.0 / (result.get("R") + result.get("W")))); System.out.println("-------------finished---------------"); } catch (IOException e) { e.printStackTrace(); } } public static void main(String[] args) { new CrawlNews().run(); } }


到此为止,一个完整的采集程序就完成了。

以上是关于基于HttpClient实现网络爬虫~以百度新闻为例的主要内容,如果未能解决你的问题,请参考以下文章

基于HttpClient4.5实现网络爬虫

java网络爬虫爬取百度新闻

网络爬虫百度新闻标题及链接爬取

java简单实现网络爬虫

java简单实现网络爬虫

基于Scrapy框架的Python新闻爬虫