Web Crawler: Scala Source Code

Posted by 小生活与大数据


In the previous post, 《人之将死,其言也善》 ("when a man is about to die, his words are kind"), K哥 used a web crawler to download the Last Statements of 534 executed offenders from the US state of Texas. This post presents the crawler's Scala source code for anyone who would like to try it out~


As usual, K哥 will send a red envelope (红包) to anyone who finds a bug~




Source code:

/**
* Created by DiamondK on 2016/2/28.
*/
import scala.concurrent._
import scala.concurrent.ExecutionContext.Implicits.global
import java.net.URL
import java.net.HttpURLConnection
import scala.collection.JavaConversions._
import java.io.ByteArrayOutputStream
import java.util.concurrent.CountDownLatch
import java.util.HashSet
import java.util.regex.Pattern
import scala.xml._

object htmlCrawler extends App {

  val _htmlCrawler = new htmlCrawler(
    "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html",
    // Follow only the per-offender "last statement" pages linked from the index.
    filter = (url: String) =>
      url.contains("/death_row/dr_info/") && url.endsWith("last.html") && !url.contains("../"),
    // Print each extracted last statement as soon as its page has been parsed.
    onDataLoaded = (lastStmt: String) => println(lastStmt + "\n")
  )

 _htmlCrawler.crawl

}

/**
 * @param startPage    the root URL from which the crawler starts crawling
 * @param filter       the predicate used to decide which discovered URLs are worth following
 * @param onDataLoaded the handler invoked with each piece of extracted page content
 */
class htmlCrawler(startPage: String,
                  filter: (String) => Boolean,
                  onDataLoaded: (String) => Any) {

 private val latch = new CountDownLatch(1)
 private val crawledPool = new HashSet[String]
 private val linkRegex = """ (src|href)="([^"]+)"|(src|href)='([^']+)' """.trim.r
  // Regex for recognising HTML responses. Note that the word boundaries must be
  // escaped: in a plain string literal "\b" is a backspace character, not a regex \b.
  private val htmlTypeRegex = "\\btext/html\\b"

 
  /** Start crawling from the root page, then block the calling thread.
    * The latch is never counted down, so crawl never returns; this keeps the JVM
    * alive while the asynchronous futures fetch and parse the remaining pages.
    */
  def crawl {
    crawlPages(startPage, parseURL(startPage)._2)
    latch.await()
  }

  /** Crawl HTML pages and fetch the contents that we want. */
  private def crawlPages(pageUrl: String, pageContent: String) {
   val innerLinks = parseLinks(pageUrl, pageContent)
    innerLinks.foreach {
      link =>
        // Fetch each link asynchronously; recurse only into responses that are HTML pages.
        val future = Future(parseURL(link))
        future.onSuccess {
          case htmlData if containInnerLinks(htmlData._1) =>
            crawlPages(link, htmlData._2)
          case _ =>
        }
       future.onFailure {
         case e: Exception =>
           println(s"Error visiting $link !\n")
           e.printStackTrace()
         case _ =>
       }
   }
 }

 private def parseLinks(parentUrl: String, html: String) = {
   val baseHost = getURLBase(parentUrl)
    val innerLinks = fetchContainedLinks(html).map {
      // Resolve every extracted href/src value to an absolute URL.
      case shortLink: String if shortLink.startsWith("/") =>
        baseHost + shortLink
      case fullLink: String if fullLink.startsWith("http:") || fullLink.startsWith("https:") =>
        fullLink
      case otherLink: String =>
        // A relative link: resolve it against the parent page's directory.
        val index = parentUrl.lastIndexOf("/")
        parentUrl.substring(0, index) + "/" + otherLink
   }.filter {
     link => !crawledPool.contains(link) && this.filter(link)
   }
   println("Found " + innerLinks.size + " links at page " + parentUrl + "!\n")
   innerLinks
 }

  /** Download the given URL, extract the last statement if the page contains one,
    * and return the response headers together with the raw page content.
    */
  private def parseURL(url: String): (Map[String, String], String) = {
   val uri = new URL(url)
   val conn = uri.openConnection().asInstanceOf[HttpURLConnection]
   conn.setConnectTimeout(100000)
   conn.setReadTimeout(1000000)
   val stream = conn.getInputStream
   val buf = Array.fill[Byte](4096)(0)
   var len = stream.read(buf)
   val out = new ByteArrayOutputStream
   while (len > -1) {
     out.write(buf, 0, len)
     len = stream.read(buf)
   }

   val headers = conn.getHeaderFields.toMap.map {
     head => (head._1, head._2.mkString(","))
   }

    val htmlDoc: String = out.toString()
    // XML.loadString expects well-formed (X)HTML and will throw on malformed markup.
    val htmlNode: xml.Node = XML.loadString(htmlDoc)
    // On a "last statement" page the content sits in the 9th div of the body:
    // the statement is the 6th paragraph, or the 7th when the 6th is only the
    // "Last Statement:" heading.
    val paragraphs = (htmlNode \\ "body" \\ "div")(8) \\ "p"
    val isThisDeathRow: Boolean = paragraphs.size > 5
    val _lastStmt: String = if (isThisDeathRow) paragraphs(5).text else ""
    val lastStmt: String =
      if (_lastStmt.equals("Last Statement:")) paragraphs(6).text
      else _lastStmt

   conn.disconnect()
   crawledPool.add(url)
   this.onDataLoaded(lastStmt)

   (headers, htmlDoc)
 }

  /** Extract all src/href attribute values from the raw HTML. */
  private def fetchContainedLinks(html: String) = {
    val list = for (m <- linkRegex.findAllIn(html).matchData if m.group(1) != null || m.group(3) != null) yield {
      if (m.group(1) != null) m.group(2) else m.group(4)
   }
   list.filter {
     link => !link.startsWith("#") && !link.startsWith("javascript:") && link != "" && !link.startsWith("mailto:")
   }.toSet

 }

  /** Return the protocol://host[:port] prefix of a URL (the port is omitted when it is 80 or unspecified). */
  private def getURLBase(url: String): String = {
   val uri:URL = new URL(url)
   val port:String = if (uri.getPort() == -1 || uri.getPort() == 80) "" else ":" + uri.getPort()
   uri.getProtocol() + "://" + uri.getHost() + port
 }

  /** Decide whether a response is an HTML page worth scanning for further links. */
  private def containInnerLinks(headers: Map[String, String]): Boolean = {
    // Guard against a missing Content-Type header so the matcher is never given null.
    val contentType: String = headers.getOrElse("Content-Type", "")
    Pattern.compile(htmlTypeRegex).matcher(contentType).find
  }

}
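
The code was written in early 2016, so it targets a pre-2.12 Scala: the Future.onSuccess and onFailure callbacks used above were deprecated in Scala 2.12, and the XML support lives in the separate scala-xml module. A minimal sbt setup for trying it out might look like the sketch below; the exact version numbers are assumptions, not part of the original post.

// build.sbt (a minimal sketch; the versions are assumptions matching the 2016-era API used above)
scalaVersion := "2.11.8"
libraryDependencies += "org.scala-lang.modules" %% "scala-xml" % "1.0.5"

With the source saved under src/main/scala/, sbt run starts crawling from the executed-offenders index page. Note that crawl blocks on the CountDownLatch indefinitely, so the process has to be stopped by hand (for example with Ctrl-C) once all the statements have been printed.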


End

Collaboration | Submissions | Inquiries

K哥: wulei.bj.cn@gmail.com

