Web Crawler: Scala Source Code
Posted by 小生活与大数据
In the previous post, 《人之将死,其言也善》 ("When a man is about to die, his words are kind"), K哥 used a web crawler to download the Last Statements of the 534 offenders executed on death row in the state of Texas. This time, K哥 shares the crawler's Scala source code for interested readers to try out~
As usual, K哥 will send a red packet to anyone who finds a bug~
Source code:
/**
* Created by DiamondK on 2016/2/28.
*/
import scala.concurrent._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.util.{Failure, Success}
import java.net.URL
import java.net.HttpURLConnection
// JavaConverters replaces the deprecated JavaConversions; conversions become explicit (.asScala).
import scala.collection.JavaConverters._
import java.io.ByteArrayOutputStream
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.CountDownLatch
import java.util.regex.Pattern
import scala.xml._
object htmlCrawler extends App {
  val _htmlCrawler = new htmlCrawler("http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html",
    // Keep only links to individual offenders' "last statement" pages.
    filter = (url: String) => url.contains("/death_row/dr_info/") && url.endsWith("last.html") && !url.contains("../"),
    // Print each extracted Last Statement as it arrives.
    onDataLoaded = (lastStmt: String) => println(lastStmt + "\n")
  )
  _htmlCrawler.crawl()
}
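// To compile and run (a sketch): with a Scala 2.11 distribution, whose lib/ directory
// already bundles the scala-xml module, the following should work as-is:
//   scalac htmlCrawler.scala
//   scala htmlCrawler
// Under sbt, add the "org.scala-lang.modules" %% "scala-xml" dependency instead.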
/**
 * @param startPage    the root URL where the crawler starts crawling
 * @param filter       predicate used to keep only the URLs worth visiting
 * @param onDataLoaded callback invoked with the extracted content once a page has been downloaded and parsed
*/
class htmlCrawler(startPage: String,
                  filter: (String) => Boolean,
                  onDataLoaded: (String) => Any) {
  // The latch is never counted down, so crawl() blocks forever; this keeps the JVM alive
  // while the daemon threads of the global ExecutionContext do the crawling.
  private val latch = new CountDownLatch(1)
  // Thread-safe set of already-visited URLs (multiple futures touch it concurrently).
  private val crawledPool = ConcurrentHashMap.newKeySet[String]()
  // Captures the value of src/href attributes, whether double- or single-quoted.
  private val linkRegex = """ (src|href)="([^"]+)"|(src|href)='([^']+)' """.trim.r
  // Note the triple quotes: in an ordinary string literal, "\btext/html\b" would embed
  // backspace characters instead of regex word boundaries and would never match.
  private val htmlTypeRegex = """\btext/html\b"""
  def crawl(): Unit = {
    crawlPages(startPage, parseURL(startPage)._2)
    latch.await()
  }
  /** Crawl HTML pages recursively and fetch the contents that we want. */
  private def crawlPages(pageUrl: String, pageContent: String): Unit = {
    val innerLinks = parseLinks(pageUrl, pageContent)
    innerLinks.foreach { link =>
      // Each page is fetched asynchronously on the global execution context.
      // onComplete replaces onSuccess/onFailure, which were removed in Scala 2.13.
      val future = Future(parseURL(link))
      future.onComplete {
        case Success(htmlData) if containInnerLinks(htmlData._1) =>
          // Only recurse into responses whose Content-Type says they are HTML.
          crawlPages(link, htmlData._2)
        case Success(_) =>
        case Failure(e) =>
          println(s"Error visiting $link !\n")
          e.printStackTrace()
      }
    }
  }
  /** Resolve the links found on a page to absolute URLs and filter them. */
  private def parseLinks(parentUrl: String, html: String) = {
    val baseHost = getURLBase(parentUrl)
    val innerLinks = fetchContainedLinks(html).map {
      case shortLink: String if shortLink.startsWith("/") =>
        baseHost + shortLink // host-relative link
      case fullLink: String if fullLink.startsWith("http:") || fullLink.startsWith("https:") =>
        fullLink // already absolute
      case otherLink: String =>
        // Relative link: resolve against the parent page's directory.
        val index = parentUrl.lastIndexOf("/")
        parentUrl.substring(0, index) + "/" + otherLink
    }.filter { link =>
      !crawledPool.contains(link) && this.filter(link)
    }
    println("Found " + innerLinks.size + " links at page " + parentUrl + "!\n")
    innerLinks
  }
  /** Download a page and extract the contents that we want. */
  private def parseURL(url: String): (Map[String, String], String) = {
    val uri = new URL(url)
    val conn = uri.openConnection().asInstanceOf[HttpURLConnection]
    conn.setConnectTimeout(100000)
    conn.setReadTimeout(1000000)
    // Read the whole response body into memory, 4 KB at a time.
    val stream = conn.getInputStream
    val buf = Array.fill[Byte](4096)(0)
    var len = stream.read(buf)
    val out = new ByteArrayOutputStream
    while (len > -1) {
      out.write(buf, 0, len)
      len = stream.read(buf)
    }
    // Flatten multi-valued headers into comma-separated strings.
    val headers = conn.getHeaderFields.asScala.toMap.map {
      head => (head._1, head._2.asScala.mkString(","))
    }
    val htmlDoc: String = out.toString()
    // The TDCJ pages are well-formed enough to parse as XML; on arbitrary HTML,
    // XML.loadString would throw a parse exception.
    val htmlNode: xml.Node = XML.loadString(htmlDoc)
    // Site-specific extraction: on these pages the 9th <div> holds the statement;
    // its 6th <p> carries the "Last Statement:" label and the text follows in the 7th <p>.
    val isThisDeathRow: Boolean = ((htmlNode \\ "body" \\ "div")(8) \\ "p").size > 5
    val _lastStmt: String = if (isThisDeathRow) ((htmlNode \\ "body" \\ "div")(8) \\ "p")(5).text else ""
    val lastStmt: String = if (_lastStmt.equals("Last Statement:"))
      ((htmlNode \\ "body" \\ "div")(8) \\ "p")(6).text
    else _lastStmt
    conn.disconnect()
    crawledPool.add(url)
    this.onDataLoaded(lastStmt)
    (headers, htmlDoc)
  }
  /** Pull all src/href values out of the raw HTML, skipping anchors, javascript: and mailto: links. */
  private def fetchContainedLinks(html: String) = {
    val list = for (m <- linkRegex.findAllIn(html).matchData if m.group(1) != null || m.group(3) != null) yield {
      if (m.group(1) != null) m.group(2) else m.group(4)
    }
    list.filter { link =>
      !link.startsWith("#") && !link.startsWith("javascript:") && link != "" && !link.startsWith("mailto:")
    }.toSet
  }
  private def getURLBase(url: String): String = {
    val uri: URL = new URL(url)
    val port: String = if (uri.getPort() == -1 || uri.getPort() == 80) "" else ":" + uri.getPort()
    uri.getProtocol() + "://" + uri.getHost() + port
  }
  private def containInnerLinks(headers: Map[String, String]): Boolean = {
    // headers.get avoids the NullPointerException the original null default would
    // trigger whenever a response carries no Content-Type header.
    headers.get("Content-Type").exists(contentType => Pattern.compile(htmlTypeRegex).matcher(contentType).find)
  }
}
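If you would rather collect the statements in a file than watch them scroll by, the onDataLoaded callback makes that a small change. Here is a minimal sketch (the object name htmlCrawlerToFile and the output path last_statements.txt are illustrative, not from the original post):

import java.io.PrintWriter

object htmlCrawlerToFile extends App {
  // PrintWriter's write methods are internally synchronized, so concurrent
  // callbacks from different futures will not interleave within a line.
  val writer = new PrintWriter("last_statements.txt")
  val crawler = new htmlCrawler("http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html",
    filter = (url: String) => url.contains("/death_row/dr_info/") && url.endsWith("last.html") && !url.contains("../"),
    onDataLoaded = (lastStmt: String) => if (lastStmt.nonEmpty) {
      writer.println(lastStmt + "\n")
      writer.flush() // flush eagerly: the crawler never counts down its latch, so it only stops when killed
    }
  )
  crawler.crawl()
}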
End
Collaboration | Submissions | Inquiries
K哥: wulei.bj.cn@gmail.com