import os
import random
import re
import threading
from queue import Queue

import requests
from lxml import etree
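
# Pipeline overview: main() fills page_queue with list-page URLs. Producer
# threads pull those URLs, parse each list page and its detail pages, and push
# (title, content, image-urls) tuples onto img_queue. Consumer threads pull
# the tuples and write the text and images to disk under billw/<title>/.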


class Httprequest(object):
    # Pool of desktop User-Agent strings; a random one is attached to each
    # request so the crawler does not present a single fixed fingerprint.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]

    @property
    def random_headers(self):
        return {'User-Agent': random.choice(self.ua_list)}


class Producer(threading.Thread, Httprequest):
    """Worker that pulls list-page URLs from page_queue and scrapes them."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url = "http://www.billwang.net/html/blogs/"
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.get_list(url)

    def get_list(self, url):
        print(f">>> Crawling list page {url}")
        html = requests.get(url, headers=self.random_headers, timeout=5).content.decode('utf-8')
        req = etree.HTML(html)
        hrefs = req.xpath('//div[@class="txtbox"]/a/@href')
        for href in hrefs:
            # The hrefs are site-relative, so prepend the scheme and host.
            href = f'{self.url.split("/html")[0]}{href}'
            self.get_content(href)

    def get_content(self, url):
        print(f">>> Crawling detail page {url}")
        html = requests.get(url, headers=self.random_headers, timeout=5).content.decode('utf-8')
        req = etree.HTML(html)
        title = req.xpath('//div[@class="detail-con"]/h1[@class="title"]/text()')[0]
        h1 = self.validate_title(title)
        content = req.xpath('//div[@class="content"]//text()')
        content = self.deal_content(content)
        content_req = req.xpath('//div[@class="content"]')[0]
        imgs = content_req.xpath('*//img/@src')
        data = (h1, content, imgs)
        print(data)
        self.img_queue.put(data)

    @staticmethod
    def validate_title(title):
        # Replace characters that are illegal in file names with underscores.
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        new_title = re.sub(pattern, "_", title)
        return new_title

    @staticmethod
    def deal_content(content):
        # Drop whitespace-only text nodes, then join the remaining fragments.
        content = ' '.join(c for c in content if c.strip())
        # Strip the site's boilerplate disclaimer; the Chinese string must be
        # kept verbatim for the match to succeed.
        content = content.replace('免责声明:本站目的在于分享更多信息,不代表本站的观点和立场,版权归原作者所有。若有侵权或异议请联系我们删除。', '')
        return content


class Consumer(threading.Thread, Httprequest):
    """Worker that pulls (title, content, imgs) tuples from img_queue and saves them."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue
        self.path = 'billw/'

    def save_content(self, h1, content, path):
        # Make sure the root output directory exists.
        os.makedirs(self.path, exist_ok=True)
        print(f">>> Saving text content for {h1}")
        text = f'{h1}\n{content}'
        with open(f'{path}{h1}.txt', 'w', encoding='utf-8') as f:
            f.write(text)
        print(">>> Saved successfully!")

    def save_imgs(self, imgs, path):
        # Number the images sequentially, keeping each source URL's extension.
        for i, img_url in enumerate(imgs, start=1):
            img_name = f'{i}{os.path.splitext(img_url)[-1]}'
            img_path = f'{path}{img_name}'
            self.save_img(img_url, img_name, img_path)

    def save_img(self, img_url, img_name, img_path):
        print(f">>> Saving image {img_name}")
        r = requests.get(img_url, headers=self.random_headers, timeout=5)
        with open(img_path, 'wb') as f:
            f.write(r.content)
        print(f">>> Image {img_name} saved successfully")

    def run(self):
        while True:
            if self.page_queue.empty() and self.img_queue.empty():
                break
            h1, content, imgs = self.img_queue.get()
            # One directory per article, named after the sanitized title.
            path = f'billw/{h1}/'
            os.makedirs(path, exist_ok=True)
            self.save_content(h1, content, path)
            self.save_imgs(imgs, path)
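
# Note: Consumer.run's empty()-then-get() check is racy. A Consumer can see
# both queues momentarily empty while a Producer is still parsing a page and
# exit early, or block on get() after the Producers have finished. A more
# robust shutdown (not shown here) would have each Producer push a sentinel
# value onto img_queue and have Consumers exit when they receive one.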


def main():
    page_queue = Queue(100)
    img_queue = Queue(10000)
    # Enqueue the first 20 list pages up front.
    for i in range(1, 21):
        url = "http://www.billwang.net/html/blogs/%d/" % i
        print(f'>>> Queuing list page {i}: {url} ...')
        page_queue.put(url)
    # Two producers parse pages; eight consumers write the results to disk.
    for x in range(2):
        t = Producer(page_queue, img_queue)
        t.start()
    for x in range(8):
        t = Consumer(page_queue, img_queue)
        t.start()


if __name__ == "__main__":
    main()
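
# Running the script produces one folder per article under billw/: the
# sanitized title becomes the directory name, the article text is saved as
# <title>.txt, and images are saved alongside it as 1.jpg, 2.png, and so on,
# with extensions taken from the source URLs.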