Scala crawler demo
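A compact web-crawler demo in Scala: starting from a seed page, it downloads pages over HttpURLConnection, extracts src/href links from the HTML with a regular expression, and concurrently follows every link that passes the user-supplied filter, using Futures on the global ExecutionContext.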
// Targets Scala 2.13 (uses scala.jdk.CollectionConverters; the old
// Future.onSuccess/onFailure callbacks were removed in 2.13).
import scala.concurrent._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.jdk.CollectionConverters._
import scala.util.{Failure, Success}
import java.net.{HttpURLConnection, URL}
import java.io.ByteArrayOutputStream
import java.util.concurrent.{ConcurrentHashMap, CountDownLatch}
import java.util.regex.Pattern
object Crawler extends App {
  new Crawler("http://www.qq.com", filter = (url: String) => url.contains(".qq.com")).crawl()
}
/**
 * @param startPage    the page the crawler starts from
 * @param filter       only URLs for which this predicate returns true are crawled
 * @param onDataLoaded callback invoked each time a download completes
 */
class Crawler(startPage: String,
              filter: String => Boolean = (url: String) => true,
              onDataLoaded: (String, Int, Array[Byte], Map[String, String]) => Any =
                (url: String, status: Int, data: Array[Byte], headers: Map[String, String]) => {
                  println(s"download $url done")
                }) {
  // The latch is never counted down, so crawl() blocks forever while the
  // futures work in the background; stop the program with Ctrl-C.
  private val latch = new CountDownLatch(1)
  // Matches src="..." and href='...' attributes in the raw HTML.
  private val linkRegex = """ (src|href)="([^"]+)"|(src|href)='([^']+)' """.trim.r
  // Must be triple-quoted: in an ordinary string literal \b is a backspace
  // character, not the regex word boundary the pattern needs.
  private val htmlTypeRegex = """\btext/html\b"""
  // Touched by futures running on different threads, so use a concurrent set.
  private val crawledPool = ConcurrentHashMap.newKeySet[String]()
  def crawl(): Unit = {
    crawlPageLinks(startPage, new String(get(startPage)._2))
    latch.await()
  }
  private def crawlPageLinks(pageUrl: String, pageContent: String): Unit = {
    val links = parseCrawlLinks(pageUrl, pageContent)
    links.foreach { link =>
      Future(get(link)).onComplete {
        // Recurse only into pages whose Content-Type says they are HTML.
        case Success((_, data, headers)) if isTextPage(headers) =>
          crawlPageLinks(link, new String(data))
        case Success(_) => // non-HTML content: downloaded, nothing to follow
        case Failure(e) =>
          println(s"visit $link error!")
          e.printStackTrace()
      }
    }
  }
  /** Resolve a possibly relative link against the page it was found on. */
  private def getFullUrl(parentUrl: String, link: String) = link match {
    case l if l.startsWith("/") => getHostBase(parentUrl) + l
    case l if l.startsWith("http:") || l.startsWith("https:") => l
    case l =>
      val index = parentUrl.lastIndexOf("/")
      parentUrl.substring(0, index) + "/" + l
  }
  private def parseCrawlLinks(parentUrl: String, html: String) = {
    val links = fetchLinks(html)
      .map(link => getFullUrl(parentUrl, link))
      .filter(link => !crawledPool.contains(link) && this.filter(link))
    println(s"found ${links.size} links on page $parentUrl")
    links
  }
  /** Download one URL, fire onDataLoaded, and return (status, body, headers). */
  def get(url: String): (Int, Array[Byte], Map[String, String]) = {
    val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection]
    conn.setConnectTimeout(100000)
    conn.setReadTimeout(1000000)
    val stream = conn.getInputStream()
    val out = new ByteArrayOutputStream
    val buf = new Array[Byte](1024)
    var len = stream.read(buf)
    while (len > -1) {
      out.write(buf, 0, len)
      len = stream.read(buf)
    }
    stream.close()
    val data = out.toByteArray()
    val status = conn.getResponseCode()
    // Flatten Map[String, java.util.List[String]] to Map[String, String];
    // the HTTP status line arrives under a null key, so drop it.
    val headers = conn.getHeaderFields().asScala
      .collect { case (key, values) if key != null => key -> values.asScala.mkString(",") }
      .toMap
    conn.disconnect()
    crawledPool.add(url)
    this.onDataLoaded(url, status, data, headers)
    (status, data, headers)
  }
  /** Extract raw src/href values, dropping anchors, javascript: and mailto: links. */
  private def fetchLinks(html: String) = {
    val list =
      for (m <- linkRegex.findAllIn(html).matchData if m.group(1) != null || m.group(3) != null)
        yield if (m.group(1) != null) m.group(2) else m.group(4)
    list.filter { link =>
      link.nonEmpty && !link.startsWith("#") && !link.startsWith("javascript:") && !link.startsWith("mailto:")
    }.toSet
  }
  /** scheme://host[:port] of the URL, omitting the scheme's default port. */
  private def getHostBase(url: String) = {
    val uri = new URL(url)
    val portPart = if (uri.getPort == -1 || uri.getPort == uri.getDefaultPort) "" else ":" + uri.getPort
    uri.getProtocol + "://" + uri.getHost + portPart
  }
  /** True when the Content-Type header says the body is HTML. */
  private def isTextPage(headers: Map[String, String]) =
    headers.get("Content-Type").exists { contentType =>
      Pattern.compile(htmlTypeRegex).matcher(contentType).find()
    }
}
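To do something more useful with the downloaded pages than printing a line, pass a custom onDataLoaded callback. Here is a minimal sketch that writes every fetched page into a local pages/ directory; the SavingCrawler name, the output directory, and the flat file-naming scheme are illustrative choices, not part of the original demo:

import java.nio.file.{Files, Paths}

object SavingCrawler extends App {
  val outDir = Paths.get("pages")
  Files.createDirectories(outDir)
  new Crawler(
    "http://www.qq.com",
    filter = (url: String) => url.contains(".qq.com"),
    onDataLoaded = (url, status, data, headers) => {
      // Derive a flat file name from the URL; good enough for a demo.
      val name = url.replaceAll("[^A-Za-z0-9.-]", "_") + ".html"
      Files.write(outDir.resolve(name), data)
      println(s"saved $url ($status, ${data.length} bytes)")
    }
  ).crawl()
}

Two caveats worth knowing: crawl() never returns, because the latch is awaited but never counted down, so the process runs until you interrupt it; and the regex-based link extraction is crude (it picks up script and image src attributes and misses unquoted ones), so a production crawler would use a real HTML parser instead.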