Kotlin实现简单爬虫功能【基于skrape】

licz 发表于 2024-11-12 13:54

用Kotlin写爬虫算是很小众了，这是业余时间的研究，在此记录一下。与Python不同，用Kotlin写爬虫感觉挺怪的。

import cn.hutool.core.date.DateUtil
import cn.hutool.core.lang.Console
import cn.hutool.poi.excel.ExcelUtil
import com.qen.yanshen.THREAD_COUNT
import com.qen.yanshen.UserAgent
import it.skrape.core.htmlDocument
import it.skrape.fetcher.HttpFetcher
import it.skrape.fetcher.Method
import it.skrape.fetcher.response
import it.skrape.fetcher.skrape
import java.util.Date
import java.util.concurrent.atomic.AtomicInteger
import kotlin.collections.forEach

// Base URL of the topic listing; page N (N > 1) is addressed as "<base>page/N/".
const val YanXuanUrl = "https://www.yanxuanwk.com/topic/tianya/"
// Highest page index that main() will request; larger indices stop the worker loop.
const val YanXuanMaxPage = 3
// Path prefix under which main() creates the dated output directory and workbook.
const val YanxuanOutFile = "/tianya"

/**
 * Crawls listing pages 1..[YanXuanMaxPage] with [THREAD_COUNT] worker threads and writes every
 * collected [YanxuanArticle] into a dated Excel workbook below [YanxuanOutFile].
 *
 * Workers pull page numbers from a shared atomic counter, so each page is fetched exactly once.
 * A page whose fetch or parse fails is logged and skipped; the remaining pages are still crawled.
 */
fun main() {
    // Pages are 1-based: page 1 is the bare topic URL, later pages live under "page/N/".
    // Starting the counter at 0 would request a non-existent "page/0/".
    val pageNum = AtomicInteger(1)

    // ArrayList is not thread-safe, so every mutation from a worker is guarded by synchronized(list).
    val list: MutableList<YanxuanArticle> = ArrayList()

    // `until` starts exactly THREAD_COUNT workers (the inclusive `..` started one extra).
    val threads = (0 until THREAD_COUNT).map {
        Thread {
            // Each worker tracks its own previously visited page; sharing one variable across
            // threads made the referer value racy.
            var referer = YanXuanUrl
            while (true) {
                val index = pageNum.getAndIncrement()
                if (index > YanXuanMaxPage) {
                    break
                }
                val urlNew = if (index == 1) YanXuanUrl else "${YanXuanUrl}page/$index/"

                Console.log("${Thread.currentThread().threadId()}：pageNum:${index}")
                try {
                    val articles = articleUrl(urlNew, referer, index)
                    synchronized(list) { list.addAll(articles) }
                } catch (e: Exception) {
                    // Best effort: one failing page must not stop the rest of the crawl.
                    e.printStackTrace()
                }

                referer = urlNew
            }
        }.apply { start() }
    }

    threads.forEach { worker -> worker.join() }

    Console.error("多线程结束了")

    // "yyyy" is the calendar year; "YYYY" is the week-based year and yields the wrong year in the
    // days around New Year. The stamp is computed once so directory and file name always agree.
    val stamp = DateUtil.format(Date(), "yyyyMMdd")

    // `use` closes (and thereby flushes) the writer even when writing throws.
    ExcelUtil.getWriter("$YanxuanOutFile/$stamp/Tianya-$stamp.xlsx").use { writer ->
        // Leave the first row empty before the data, as the original layout did.
        writer.passCurrentRow()
        // Header aliases for the title/time/url columns were disabled in the original and stay so;
        // column headers therefore use the property names.
        writer.write(list, true)
    }
}

/**
 * One article entry scraped from a listing page.
 *
 * @property title link text of the article heading
 * @property url value of the heading link's `href` attribute
 * @property time text of the entry's update-date element, kept as scraped
 */
data class YanxuanArticle(
    var title: String,
    var url: String,
    var time: String,
)

/**
 * Builds the fixed set of browser-like HTTP request headers used for listing-page requests.
 *
 * @return a fresh insertion-ordered map of header name to header value
 */
fun initYanXuanHeader(): Map<String, String> {
    val headers = LinkedHashMap<String, String>()
    headers.apply {
        put(
            "accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
        )
        put("accept-language", "zh-CN,zh;q=0.9,en;q=0.8")
        put("cache-control", "no-cache")
        put("pragma", "no-cache")
        put("priority", "u=0, i")
        put("sec-ch-ua", "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"")
        put("sec-ch-ua-mobile", "?0")
        put("sec-ch-ua-platform", "\"macOS\"")
        put("sec-fetch-dest", "document")
        put("sec-fetch-mode", "navigate")
        put("sec-fetch-site", "same-origin")
        put("sec-fetch-user", "?1")
        put("upgrade-insecure-requests", "1")
    }
    return headers
}

/**
 * Builds the fixed cookie set sent with listing-page requests.
 *
 * @return a fresh insertion-ordered map of cookie name to cookie value
 */
fun initYanXuanCookie(): Map<String, String> {
    val cookies = LinkedHashMap<String, String>()
    cookies["_ga"] = "GA1.1.1256512866.1729579340"
    cookies["fontSize"] = "18.00"
    cookies["_ga_B79H5MXFYF"] = "GS1.1.1729579339.1.1.1729581094.0.0.0"
    return cookies
}

/**
 * Fetches one listing page and turns every `.entry-header` element into a [YanxuanArticle].
 *
 * @param goUrl absolute URL of the listing page to request
 * @param refererUrl URL sent as the `referer` request header
 * @param i page index of [goUrl]; kept for caller compatibility, not needed to build the request
 * @return the articles found on the page, in document order
 * @throws Exception whatever the fetcher or the selectors raise (e.g. a required element is
 *         missing); callers are expected to handle it
 */
fun articleUrl(goUrl: String, refererUrl: String, i: Int): ArrayList<YanxuanArticle> {
    val list = ArrayList<YanxuanArticle>()
    skrape(HttpFetcher) {
        request {
            url = goUrl
            // The referer was accepted as a parameter but never sent; attach it to the headers.
            headers = initYanXuanHeader() + ("referer" to refererUrl)
            cookies = initYanXuanCookie()
            userAgent = UserAgent
            method = Method.GET
            body = null
            // To route through a local HTTP proxy, set e.g.:
            // proxy = ProxyBuilder(Proxy.Type.HTTP, "127.0.0.1", 26001)
        }
        response {
            htmlDocument {
                ".entry-header" {
                    findAll {
                        forEach { entry ->
                            // The first child of the title element carries the link; an entry
                            // without one is skipped instead of failing the whole page.
                            val link = entry.findFirst(".entry-title").children.firstOrNull()
                                ?: return@forEach
                            val updated = entry.findFirst(".entry-meta")
                                .findFirst(".meta-date")
                                .findFirst(".updated")
                            val href = link.attribute("href")
                            Console.log("title:${link.text};url:${href}; time:${updated.text}")
                            list.add(
                                YanxuanArticle(
                                    title = link.text,
                                    url = href,
                                    time = updated.text
                                )
                            )
                        }
                    }
                }
            }
        }
    }
    return list
}

左服发表于 2024-11-12 15:50

感谢分享

J3ggedPeak 发表于 2024-11-12 16:26

感谢分享

Vixb1122 发表于 2024-11-12 16:38

感谢分享，skrape 确实不错

zoomyou 发表于 2024-11-22 11:47

请问这个程序，能抓取全网特定行业的网站、邮箱、联系方式吗？谢谢！

licz 发表于 2024-11-22 13:42

zoomyou 发表于 2024-11-22 11:47
请问这个程序，能抓取全网特定行业的网站、邮箱、联系方式吗？谢谢！

不行，这种方式很容易被识别出来，还是用python比较给力

zoomyou 发表于 2024-11-22 22:03

licz 发表于 2024-11-22 13:42
不行，这种方式很容易被识别出来，还是用python比较给力

谢谢楼主恳切回复！

页: [1]

吾爱破解 - 52pojie.cn's Archiver

Kotlin实现简单爬虫功能【基于skrape】