Kotlin: Implementing a Simple Crawler (Based on skrape)
Writing a crawler in Kotlin is pretty niche; this is a hobby experiment I'm writing down here. Unlike Python, crawling in Kotlin feels a bit unusual.

import cn.hutool.core.date.DateUtil
import cn.hutool.core.lang.Console
import cn.hutool.poi.excel.ExcelUtil
import com.qen.yanshen.THREAD_COUNT
import com.qen.yanshen.UserAgent
import it.skrape.core.htmlDocument
import it.skrape.fetcher.HttpFetcher
import it.skrape.fetcher.Method
import it.skrape.fetcher.response
import it.skrape.fetcher.skrape
import java.util.Collections
import java.util.Date
import java.util.concurrent.atomic.AtomicInteger
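Not part of the original listing, but for anyone reproducing it: the imports above pull from skrape{it} and Hutool, and THREAD_COUNT / UserAgent come from my own package (any small thread count and a desktop Chrome UA string would stand in). A minimal build.gradle.kts sketch; the coordinates are real artifacts, but the versions are assumptions:

plugins {
    kotlin("jvm") version "2.0.0" // version is an assumption
}

repositories {
    mavenCentral()
}

dependencies {
    implementation("it.skrape:skrapeit:1.2.2")        // skrape DSL, HttpFetcher, htmlDocument
    implementation("cn.hutool:hutool-all:5.8.32")     // DateUtil, Console, ExcelUtil
    implementation("org.apache.poi:poi-ooxml:5.2.5")  // hutool's ExcelUtil needs POI on the classpath
}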
const val YanXuanUrl = "https://www.yanxuanwk.com/topic/tianya/"
const val YanXuanMaxPage = 3
const val YanxuanOutFile = "/tianya"
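With these constants, page 1 is fetched from the bare topic URL and pages 2 through YanXuanMaxPage from YanXuanUrl + "page/N/"; the results are written as an .xlsx file under /tianya/<yyyyMMdd>/.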
fun main() {
    val threads = ArrayList<Thread>()
    val pageNum = AtomicInteger(1) // pages start at 1; page 1 uses the bare topic URL
    var referer = YanXuanUrl // previous page, passed along as the Referer header
    // appended to from several worker threads, so it has to be synchronized
    val list: MutableList<YanxuanArticle> = Collections.synchronizedList(ArrayList<YanxuanArticle>())
    for (i in 0 until THREAD_COUNT) { // 0..THREAD_COUNT would start one thread too many
        val thread = Thread {
            try {
                while (true) {
                    val index = pageNum.getAndIncrement()
                    if (index > YanXuanMaxPage) {
                        break
                    }
                    val urlNew = if (index == 1) YanXuanUrl else YanXuanUrl + "page/${index}/"
                    // threadId() needs JDK 19+; on older JDKs use Thread.currentThread().id
                    Console.log("${Thread.currentThread().threadId()}:pageNum:${index}")
                    try {
                        list.addAll(articleUrl(urlNew, referer))
                    } catch (e: Exception) {
                        e.printStackTrace()
                    }
                    referer = urlNew // racy across threads, but only a best-effort Referer
                }
            } catch (e: Exception) {
                e.printStackTrace()
            }
        }
        thread.start()
        threads.add(thread)
    }
    threads.forEach { t -> t.join() } // wait for every worker before writing the file
    Console.error("All worker threads finished")
    // "yyyy" is the calendar year; the original "YYYY" (week-based year) can roll
    // over to the wrong year around New Year
    val day = DateUtil.format(Date(), "yyyyMMdd")
    val writer = ExcelUtil.getWriter("$YanxuanOutFile/$day/Tianya-$day.xlsx")
    writer.passCurrentRow()
    // writer.merge(quoteList.size - 1, "标题")
    // writer.addHeaderAlias("title", "标题");
    // writer.addHeaderAlias("time", "时间");
    // writer.addHeaderAlias("url", "链接");
    writer.write(list, true)
    writer.close()
}
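Not from the original post: the manual Thread + AtomicInteger fan-out above works, but the same fan-out can be sketched with coroutines, which avoids the synchronized list entirely because each task returns its own results. This assumes org.jetbrains.kotlinx:kotlinx-coroutines-core is on the classpath; fetchPage is a hypothetical stand-in for articleUrl:

import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.runBlocking

// hypothetical stand-in for articleUrl(url, referer)
fun fetchPage(page: Int): List<String> = listOf("stub for page $page")

fun main() = runBlocking {
    val results = (1..3).map { page ->       // 3 stands in for YanXuanMaxPage
        async(Dispatchers.IO) {               // IO dispatcher suits blocking HTTP calls
            runCatching { fetchPage(page) }.getOrElse { emptyList() }
        }
    }.awaitAll().flatten()                    // collect without any shared mutable list
    println("fetched ${results.size} rows")
}

The win over raw threads is that each async block returns its page's rows and awaitAll gathers them, so there is no shared mutable state to guard.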
// Plain row type for the Excel export; hutool maps the properties to columns.
data class YanxuanArticle(
    val title: String,
    val url: String,
    val time: String
)
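The commented-out addHeaderAlias calls in main() hint at Chinese column headers. In case anyone wants them, here is a minimal Hutool sketch using the data class above; the output path is made up:

import cn.hutool.poi.excel.ExcelUtil

fun writeDemo() {
    val rows = listOf(YanxuanArticle("First post", "https://example.com/1", "2024-11-20"))
    val writer = ExcelUtil.getWriter("/tmp/tianya-demo.xlsx") // hypothetical path
    writer.addHeaderAlias("title", "标题") // bean property -> column header
    writer.addHeaderAlias("time", "时间")
    writer.addHeaderAlias("url", "链接")
    writer.setOnlyAlias(true) // keep only aliased columns, in alias order
    writer.write(rows, true)  // true = write the header row first
    writer.close()
}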
// Request headers copied from a real Chrome session so the request
// looks like an ordinary browser navigation.
fun initYanXuanHeader(): Map<String, String> {
    return mapOf(
        "accept" to "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language" to "zh-CN,zh;q=0.9,en;q=0.8",
        "cache-control" to "no-cache",
        "pragma" to "no-cache",
        "priority" to "u=0, i",
        "sec-ch-ua" to "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
        "sec-ch-ua-mobile" to "?0",
        "sec-ch-ua-platform" to "\"macOS\"",
        "sec-fetch-dest" to "document",
        "sec-fetch-mode" to "navigate",
        "sec-fetch-site" to "same-origin",
        "sec-fetch-user" to "?1",
        "upgrade-insecure-requests" to "1"
    )
}
// Google Analytics cookies captured from a browser visit; the site itself
// probably does not require them, but they make the session look more real.
fun initYanXuanCookie(): Map<String, String> {
    return mapOf(
        "_ga" to "GA1.1.1256512866.1729579340",
        "fontSize" to "18.00",
        "_ga_B79H5MXFYF" to "GS1.1.1729579339.1.1.1729581094.0.0.0"
    )
}
fun articleUrl(goUrl: String, refererUrl: String): ArrayList<YanxuanArticle> {
    val list = ArrayList<YanxuanArticle>()
    skrape(HttpFetcher) {
        request {
            url = goUrl
            // send the previous page as the Referer so the pagination looks organic
            // (the original tracked the referer but never actually sent it)
            headers = initYanXuanHeader() + ("referer" to refererUrl)
            cookies = initYanXuanCookie()
            userAgent = UserAgent
            method = Method.GET
            body = null
            // proxy = ProxyBuilder(Proxy.Type.HTTP, "127.0.0.1", 26001)
        }
        response {
            htmlDocument {
                ".entry-header" {
                    findAll {
                        forEach { t ->
                            // first child of .entry-title is the <a> holding title and link
                            val a = t.findFirst(".entry-title").children[0]
                            val tm = t.findFirst(".entry-meta").findFirst(".meta-date").findFirst(".updated")
                            Console.log("title:${a.text};url:${a.attribute("href")}; time:${tm.text}")
                            list.add(
                                YanxuanArticle(
                                    title = a.text,
                                    url = a.attribute("href"),
                                    time = tm.text
                                )
                            )
                        }
                    }
                }
            }
        }
    }
    return list
}
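To see what the selector block above matches without hitting the site, skrape{it}'s htmlDocument can also parse a raw string. A small sketch; the markup is my guess at the page structure, reverse-engineered from the selectors used above:

import it.skrape.core.htmlDocument

fun main() {
    // Synthetic snippet mirroring what .entry-header / .entry-title /
    // .meta-date / .updated are assumed to look like on the real page.
    val html = """
        <article>
          <header class="entry-header">
            <h2 class="entry-title"><a href="https://example.com/post-1">Post One</a></h2>
            <div class="entry-meta"><span class="meta-date"><time class="updated">2024-11-20</time></span></div>
          </header>
        </article>
    """.trimIndent()

    htmlDocument(html) {
        ".entry-header" {
            findAll {
                forEach { t ->
                    val a = t.findFirst(".entry-title").children[0]
                    val tm = t.findFirst(".updated")
                    println("title=${a.text} url=${a.attribute("href")} time=${tm.text}")
                }
            }
        }
    }
}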
Thanks for sharing.

Thanks for sharing.

Thanks for sharing; skrape really is quite nice.

zoomyou posted on 2024-11-22 11:47:
Could this program crawl websites, email addresses, and contact details for a specific industry across the whole web? Thanks!

licz posted on 2024-11-22 13:42, replying:
No. Requests made this way are easy to detect; for that kind of job, Python is still the stronger option.

Thank you for the sincere reply, OP!