申请会员ID:AsCenSion
1、申请ID:AsCenSion
2、个人邮箱:549847849@qq.com
3、原创技术文章:
个人介绍:
个人比较喜欢爬虫,喜欢在网上通过爬虫下载一些资源。
原创内容介绍:
需求
最近偶然对本子感兴趣起来,但是由于众所周知的原因本子这个东西不太容易下载,所以就想自己写个爬虫下载本子。
源
本人知道的不用科学上网就能访问的本子网站目前只有erocool。
地址发布页:https://tellmeurl.com/erocool/
[*]1
中文版地址:https://zh.erocool.me/
[*]1
世徒老师的本是真的顶,下面就以:
[世徒ゆうき] 千歳 -chitose- 第四話 (COMIC 夢幻転生 2020年8月号) [中国翻訳]
为例写个爬虫。
地址:https://zh.erocool.me/detail/1686650o321307.html
[*]1
正文
分析
用chrome的F12功能分析网页中本子图片的下载链接:
https://img-blog.csdnimg.cn/20200722172829122.png
可以看到“src=”的后面有个地址:https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/1686650/1.jpg
[*]1
可能是图片下载地址,试一试,确实是。
多试几个本子,可得下载地址格式为:https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/a/n.type
[*]1
其中a由本子决定,n是页数,type为文件格式,绝大多数为jpg,小部分为png。
a其实就是本子地址中间的一段数字,可以用re.search提取。
不难发现这个格式后半部分也可以构成一个地址,为:https://mi.404cdn.com/galleries/a/n.type
[*]1
尝试后发现这也是一个图片的下载地址,但是下载速度较慢。
为讨论方便,用A格式指代较长的格式,B格式指代较短的格式。
A:https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/a/n.type
B:https://mi.404cdn.com/galleries/a/n.type
[*]1
[*]2
[*]3
[*]4
二者差别如下:
1.A格式地址下载速度快,适合下载图片比例一般的漫画;
2.下载竖直/水平比例较大的漫画,尤其是韩漫的时候,A格式地址返回的图片清晰度极低,没法看,此时只能使用B格式下载。
A格式的下载链接实际上可以用Xpath提取,格式为://img[@class = "vimg lazyloaded"]/@src
[*]1
这在网页中使用Xpath提取是没有问题的,但是实际在pycharm中,可能由于图片是根据浏览位置来加载的,发现返回的html文件中是提取不到这个信息的。
所以在爬取图片的时候,这个链接就需要我们自己按照分析出来的格式来构造了。
这就导致下载的时候需要手动确定下载链接的格式以及文件类型。代码v1经过分析就可以码代码了,具体代码如下:import reimport requestsfrom lxml import etreeimport os# 函数定义# 创建文件夹def mkdir(path): # 判断是否存在文件夹如果不存在则创建为文件夹 # 如果路径不存在会创建这个路径 folder = os.path.exists(path) if not folder: os.makedirs(path)# 超时处理def get(url, header): i = 0 while i < 3: try: result = requests.get(url, headers=header, timeout=5) return result except requests.exceptions.RequestException: print("TIME OUT " + str(i+1)) i += 1# 补零操作def zero_fill(path): file_list = os.listdir(path) for file in file_list: if not file.endswith('.txt'): # 补0 10表示补0后名字共10位 filename = file.zfill(10) os.rename(path + '/' + file, path + '/' + filename)# 下载文件def dld_erocool(erocool_urls, local_path, head_type, file_type): # 功能 # 下载erocool链接中的图片 # erocool_urls # erocool链接 # local_path # 下载位置,将在该位置创建文件夹 # head_type # 1:长链接,下载一般尺寸;0:短连接,下载竖直方向特长尺寸(韩漫) for erocool_url in erocool_urls: print('------------------------------------') # 确定下载链接url中部,从erocool链接中提取 url_mid = re.search('detail/(.*)o', erocool_url).group(1) # 确定下载链接url头部,由head_type确定 if head_type == 1: url_head = 'https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/' elif head_type == 0: url_head = 'https://mi.404cdn.com/galleries/' # User Agent header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36', 'Cookie': '_ga=GA1.2.99173477.1570796706; csrftoken=OK1ZGOurCtTNFgBhOEauJm3krQyQVR28xSP7Zu9EEv8MjiCgwdQyPyKqViaGkmG4; Hm_lvt_7fdef555dc32f7d31fadd14999021b7b=1570796701,1570941042; _gid=GA1.2.160071259.1570941044; Hm_lpvt_7fdef555dc32f7d31fadd14999021b7b=1570941059', 'Connection': 'close' } # 请求 print('\tURL:') print('\t\t' + erocool_url) response = get(erocool_url, header) # print(response.text) with open('temp.txt', 'wb') as file: file.write(response.content) # 选取数据:总页数、名称 pic_num = int( re.search('(.*) 頁', etree.HTML(response.text).xpath('//div[@class = "ld_box"]/div/div/text()')[3).group(1)) ero_name = 
etree.HTML(response.text).xpath('//h1/text()')[0 ero_name = ero_name.replace('/', '-') ero_name = ero_name.replace(':', '-') # 创建文件夹 local_ero_path = local_path + '/' + ero_name mkdir(local_ero_path) local_url_path = local_ero_path + '/url.txt' with open(local_url_path, 'w') as file: file.write(erocool_url) print('\t' + 'Local directory:') print('\t\t' + local_ero_path) for i in range(1, pic_num + 1): pic_name = str(i) + '.' + file_type pic_url = url_head + url_mid + '/' + pic_name local_name = local_ero_path + '/' + pic_name.zfill(10) # 判断是否存在文件 exist = os.path.isfile(local_name) print('\t' + str(i) + '/' + str(pic_num)) if not exist: print('\t\t' + 'State: no such file, to be downloaded') # 下载 with open(local_name, 'wb') as file: content_temp = get(pic_url, header) file.write(content_temp.content) print('\t\t' + 'Download finished') else: # 判断文件是否有效 if os.path.getsize(local_name) < 1024: print('\t\t' + 'State: invalid file, to be downloaded') # 下载 with open(local_name, 'wb') as file: content_temp = get(pic_url, header) file.write(content_temp.content) print('\t\t' + 'Download finished') else: print('\t\t' + 'State: downloaded already') size = str(round(os.path.getsize(local_name) / 1024, 2)) print('\t\t' + 'File size: ' + size + ' KB')# 主程序# 下载位置,将在该位置创建文件夹local_path = "F:/temp/aira2temp/erocool"# erocool链接urls = [ 'https://zh.erocool.me/detail/1686650o321307.html',# 图片下载链接类型# 1:长链接,下载一般尺寸;0:短连接,下载竖直方向特长尺寸(韩漫)head_type = 1# 图片类型# 一般是jpg;小部分是pngfile_type = 'jpg'# 下载dld_erocool(urls, local_path, head_type, file_type)
[*]1
[*]2
[*]3
[*]4
[*]5
[*]6
[*]7
[*]8
[*]9
[*]10
[*]11
[*]12
[*]13
[*]14
[*]15
[*]16
[*]17
[*]18
[*]19
[*]20
[*]21
[*]22
[*]23
[*]24
[*]25
[*]26
[*]27
[*]28
[*]29
[*]30
[*]31
[*]32
[*]33
[*]34
[*]35
[*]36
[*]37
[*]38
[*]39
[*]40
[*]41
[*]42
[*]43
[*]44
[*]45
[*]46
[*]47
[*]48
[*]49
[*]50
[*]51
[*]52
[*]53
[*]54
[*]55
[*]56
[*]57
[*]58
[*]59
[*]60
[*]61
[*]62
[*]63
[*]64
[*]65
[*]66
[*]67
[*]68
[*]69
[*]70
[*]71
[*]72
[*]73
[*]74
[*]75
[*]76
[*]77
[*]78
[*]79
[*]80
[*]81
[*]82
[*]83
[*]84
[*]85
[*]86
[*]87
[*]88
[*]89
[*]90
[*]91
[*]92
[*]93
[*]94
[*]95
[*]96
[*]97
[*]98
[*]99
[*]100
[*]101
[*]102
[*]103
[*]104
[*]105
[*]106
[*]107
[*]108
[*]109
[*]110
[*]111
[*]112
[*]113
[*]114
[*]115
[*]116
[*]117
[*]118
[*]119
[*]120
[*]121
[*]122
[*]123
[*]124
[*]125
[*]126
[*]127
[*]128
[*]129
[*]130
[*]131
[*]132
[*]133
[*]134
[*]135
[*]136
[*]137
[*]138
[*]139
[*]140
[*]141
[*]142
[*]143
[*]144
[*]145
[*]146
[*]147
正常运行结果:------------------------------------ URL: https://zh.erocool.me/detail/1686650o321307.html Local directory: F:/temp/aira2temp/erocool/[世徒ゆうき] 千歳 -chitose- 第四話 (COMIC 夢幻転生 2020年8月号) [中国翻訳] 1/42 State: no such file, to be downloaded Download finished File size: 460.71 KB 2/42 State: no such file, to be downloaded Download finished File size: 400.66 KB 3/42 State: no such file, to be downloaded Download finished File size: 434.52 KB 4/42 State: no such file, to be downloaded Download finished File size: 414.37 KB 5/42 State: no such file, to be downloaded Download finished File size: 381.43 KB 6/42 State: no such file, to be downloaded Download finished File size: 407.48 KB 7/42 State: no such file, to be downloaded Download finished File size: 388.39 KB 8/42 State: no such file, to be downloaded Download finished File size: 404.4 KB 9/42 State: no such file, to be downloaded Download finished File size: 432.16 KB 10/42 State: no such file, to be downloaded Download finished File size: 391.35 KB 11/42 State: no such file, to be downloaded Download finished File size: 365.73 KB 12/42 State: no such file, to be downloaded Download finished File size: 416.43 KB 13/42 State: no such file, to be downloaded Download finished File size: 367.79 KB 14/42 State: no such file, to be downloaded Download finished File size: 396.92 KB 15/42 State: no such file, to be downloaded Download finished File size: 385.43 KB 16/42 State: no such file, to be downloaded Download finished File size: 479.14 KB 17/42 State: no such file, to be downloaded Download finished File size: 444.29 KB 18/42 State: no such file, to be downloaded Download finished File size: 405.21 KB 19/42 State: no such file, to be downloaded Download finished File size: 398.37 KB 20/42 State: no such file, to be downloaded Download finished File size: 455.32 KB 21/42 State: no such file, to be downloaded Download finished File size: 409.46 KB 22/42 State: no such file, to be downloaded Download finished File 
size: 365.91 KB 23/42 State: no such file, to be downloaded Download finished File size: 397.73 KB 24/42 State: no such file, to be downloaded Download finished File size: 491.33 KB 25/42 State: no such file, to be downloaded Download finished File size: 420.46 KB 26/42 State: no such file, to be downloaded Download finished File size: 418.84 KB 27/42 State: no such file, to be downloaded Download finished File size: 449.82 KB 28/42 State: no such file, to be downloaded Download finished File size: 489.8 KB 29/42 State: no such file, to be downloaded Download finished File size: 487.49 KB 30/42 State: no such file, to be downloaded Download finished File size: 434.8 KB 31/42 State: no such file, to be downloaded Download finished File size: 352.65 KB 32/42 State: no such file, to be downloaded Download finished File size: 369.91 KB 33/42 State: no such file, to be downloaded Download finished File size: 481.56 KB 34/42 State: no such file, to be downloaded Download finished File size: 371.06 KB 35/42 State: no such file, to be downloaded Download finished File size: 333.14 KB 36/42 State: no such file, to be downloaded Download finished File size: 367.17 KB 37/42 State: no such file, to be downloaded Download finished File size: 403.3 KB 38/42 State: no such file, to be downloaded Download finished File size: 405.74 KB 39/42 State: no such file, to be downloaded Download finished File size: 457.22 KB 40/42 State: no such file, to be downloaded Download finished File size: 425.29 KB 41/42 State: no such file, to be downloaded Download finished File size: 438.93 KB 42/42 State: no such file, to be downloaded Download finished File size: 417.93 KBProcess finished with exit code 0
[*]1
[*]2
[*]3
[*]4
[*]5
[*]6
[*]7
[*]8
[*]9
[*]10
[*]11
[*]12
[*]13
[*]14
[*]15
[*]16
[*]17
[*]18
[*]19
[*]20
[*]21
[*]22
[*]23
[*]24
[*]25
[*]26
[*]27
[*]28
[*]29
[*]30
[*]31
[*]32
[*]33
[*]34
[*]35
[*]36
[*]37
[*]38
[*]39
[*]40
[*]41
[*]42
[*]43
[*]44
[*]45
[*]46
[*]47
[*]48
[*]49
[*]50
[*]51
[*]52
[*]53
[*]54
[*]55
[*]56
[*]57
[*]58
[*]59
[*]60
[*]61
[*]62
[*]63
[*]64
[*]65
[*]66
[*]67
[*]68
[*]69
[*]70
[*]71
[*]72
[*]73
[*]74
[*]75
[*]76
[*]77
[*]78
[*]79
[*]80
[*]81
[*]82
[*]83
[*]84
[*]85
[*]86
[*]87
[*]88
[*]89
[*]90
[*]91
[*]92
[*]93
[*]94
[*]95
[*]96
[*]97
[*]98
[*]99
[*]100
[*]101
[*]102
[*]103
[*]104
[*]105
[*]106
[*]107
[*]108
[*]109
[*]110
[*]111
[*]112
[*]113
[*]114
[*]115
[*]116
[*]117
[*]118
[*]119
[*]120
[*]121
[*]122
[*]123
[*]124
[*]125
[*]126
[*]127
[*]128
[*]129
[*]130
[*]131
[*]132
[*]133
[*]134
[*]135
[*]136
[*]137
[*]138
[*]139
[*]140
[*]141
[*]142
[*]143
[*]144
[*]145
[*]146
[*]147
[*]148
[*]149
[*]150
[*]151
[*]152
[*]153
[*]154
[*]155
[*]156
[*]157
[*]158
[*]159
[*]160
[*]161
[*]162
[*]163
[*]164
[*]165
[*]166
[*]167
[*]168
[*]169
[*]170
[*]171
[*]172
[*]173
[*]174
[*]175
在主程序下方可以更改下载位置以及本子链接。
不足之处在于具体图片下载链接类型、图片类型要根据本子具体类型自行判断。原因就是本子链接返回的html内容中没有这些信息。
各位大佬如果有能够通过本子链接返回的html内容来提取图片下载链接类型、图片类型的方法,万望在评论区教我。
v2
第二版代码。
相比上一版的改进:默认下载jpg文件,如果文件太小则会认为文件无效,继续下载png文件,所以这一版不需要知道图片具体的类型是jpg还是png,可以应对漫画内容既有jpg也有png的情况。
以下代码下载的是一个既有jpg也有png的本子。
代码:import reimport requestsfrom lxml import etreeimport os# 函数定义# 创建文件夹def mkdir(path): # 判断是否存在文件夹如果不存在则创建为文件夹 # 如果路径不存在会创建这个路径 folder = os.path.exists(path) if not folder: os.makedirs(path)# 超时处理def get(url, header): i = 0 while i < 3: try: result = requests.get(url, headers=header, timeout=5) return result except requests.exceptions.RequestException: print("TIME OUT " + str(i+1)) i += 1# 补零操作def zero_fill(path): file_list = os.listdir(path) for file in file_list: if not file.endswith('.txt'): # 补0 10表示补0后名字共10位 filename = file.zfill(10) os.rename(path + '/' + file, path + '/' + filename)# 下载文件def dld_erocool(erocool_urls, local_path, head_type): # 功能 # 下载erocool链接中的图片 # erocool_urls # erocool链接 # local_path # 下载位置,将在该位置创建文件夹 # head_type # 1:长链接,下载一般尺寸;0:短连接,下载竖直方向特长尺寸(韩漫) for erocool_url in erocool_urls: print('------------------------------------') # 确定下载链接url中部,从erocool链接中提取 url_mid = re.search('detail/(.*)o', erocool_url).group(1) # 确定下载链接url头部,由head_type确定 if head_type == 1: url_head = 'https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/' elif head_type == 0: url_head = 'https://mi.404cdn.com/galleries/' # User Agent header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36', 'Cookie': '_ga=GA1.2.99173477.1570796706; csrftoken=OK1ZGOurCtTNFgBhOEauJm3krQyQVR28xSP7Zu9EEv8MjiCgwdQyPyKqViaGkmG4; Hm_lvt_7fdef555dc32f7d31fadd14999021b7b=1570796701,1570941042; _gid=GA1.2.160071259.1570941044; Hm_lpvt_7fdef555dc32f7d31fadd14999021b7b=1570941059', 'Connection': 'close' } # 请求 print('\tURL:') print('\t\t' + erocool_url) response = get(erocool_url, header) # print(response.text) with open('temp.txt', 'wb') as file: file.write(response.content) # 选取数据:总页数、名称 pic_num = int( re.search('(.*) 頁', etree.HTML(response.text).xpath('//div[@class = "ld_box"]/div/div/text()')[3).group(1)) ero_name = etree.HTML(response.text).xpath('//h1/text()')[0 ero_name = ero_name.replace('/', '-') 
ero_name = ero_name.replace(':', '-') ero_name = ero_name.replace('?', '-') # 创建文件夹 local_ero_path = local_path + '/' + ero_name mkdir(local_ero_path) local_url_path = local_ero_path + '/url.txt' with open(local_url_path, 'w') as file: file.write(erocool_url) print('\t' + 'Local directory:') print('\t\t' + local_ero_path) for i in range(1, pic_num + 1): pic_name = str(i) + '.' + 'jpg' pic_url = url_head + url_mid + '/' + pic_name local_name = local_ero_path + '/' + pic_name.zfill(10) print('\t' + str(i) + '/' + str(pic_num)) # 判断是否存在jpg在文件 exist = os.path.isfile(local_name) if not exist: print('\t\t' + 'State: no jpg file, to be downloaded') # 下载 with open(local_name, 'wb') as file: content_temp = get(pic_url, header) file.write(content_temp.content) print('\t\t' + 'Download finished') else: print('\t\t' + 'State: jpg downloaded already') # 判断jpg文件是否有效,无效重新下载 if os.path.getsize(local_name) < 1024: print('\t\t' + 'State: invalid jpg file, try again') os.remove(local_name) pic_name = str(i) + '.' + 'jpg' pic_url = url_head + url_mid + '/' + pic_name local_name = local_ero_path + '/' + pic_name.zfill(10) # 下载 with open(local_name, 'wb') as file: content_temp = get(pic_url, header) file.write(content_temp.content) print('\t\t' + 'Download finished') # 判断jpg文件是否有效,无效则下载png if os.path.getsize(local_name) < 1024: print('\t\t' + 'State: invalid jpg file, try png') os.remove(local_name) pic_name = str(i) + '.' 
+ 'png' pic_url = url_head + url_mid + '/' + pic_name local_name = local_ero_path + '/' + pic_name.zfill(10) # 判断是否存在png在文件 exist = os.path.isfile(local_name) if not exist: print('\t\t' + 'State: no png file, to be downloaded') # 下载 with open(local_name, 'wb') as file: content_temp = get(pic_url, header) file.write(content_temp.content) print('\t\t' + 'Download finished') else: print('\t\t' + 'State: png downloaded already') # 判断png文件是否有效,无效重新下载 if os.path.getsize(local_name) < 1024: print('\t\t' + 'State: invalid png file, try again') os.remove(local_name) pic_name = str(i) + '.' + 'png' pic_url = url_head + url_mid + '/' + pic_name local_name = local_ero_path + '/' + pic_name.zfill(10) # 下载 with open(local_name, 'wb') as file: content_temp = get(pic_url, header) file.write(content_temp.content) print('\t\t' + 'Download finished') size = str(round(os.path.getsize(local_name) / 1024, 2)) print('\t\t' + 'File size: ' + size + ' KB')# 主程序# 下载位置,将在该位置创建文件夹local_path = "F:/temp/aira2temp/erocool"# erocool链接urls = [ 'https://zh.erocool.me/detail/1547299o296458.html',# 图片下载链接类型# 1:长链接,下载一般尺寸;0:短连接,下载竖直方向特长尺寸(韩漫)head_type = 1# 下载dld_erocool(urls, local_path, head_type)
[*]1
[*]2
[*]3
[*]4
[*]5
[*]6
[*]7
[*]8
[*]9
[*]10
[*]11
[*]12
[*]13
[*]14
[*]15
[*]16
[*]17
[*]18
[*]19
[*]20
[*]21
[*]22
[*]23
[*]24
[*]25
[*]26
[*]27
[*]28
[*]29
[*]30
[*]31
[*]32
[*]33
[*]34
[*]35
[*]36
[*]37
[*]38
[*]39
[*]40
[*]41
[*]42
[*]43
[*]44
[*]45
[*]46
[*]47
[*]48
[*]49
[*]50
[*]51
[*]52
[*]53
[*]54
[*]55
[*]56
[*]57
[*]58
[*]59
[*]60
[*]61
[*]62
[*]63
[*]64
[*]65
[*]66
[*]67
[*]68
[*]69
[*]70
[*]71
[*]72
[*]73
[*]74
[*]75
[*]76
[*]77
[*]78
[*]79
[*]80
[*]81
[*]82
[*]83
[*]84
[*]85
[*]86
[*]87
[*]88
[*]89
[*]90
[*]91
[*]92
[*]93
[*]94
[*]95
[*]96
[*]97
[*]98
[*]99
[*]100
[*]101
[*]102
[*]103
[*]104
[*]105
[*]106
[*]107
[*]108
[*]109
[*]110
[*]111
[*]112
[*]113
[*]114
[*]115
[*]116
[*]117
[*]118
[*]119
[*]120
[*]121
[*]122
[*]123
[*]124
[*]125
[*]126
[*]127
[*]128
[*]129
[*]130
[*]131
[*]132
[*]133
[*]134
[*]135
[*]136
[*]137
[*]138
[*]139
[*]140
[*]141
[*]142
[*]143
[*]144
[*]145
[*]146
[*]147
[*]148
[*]149
[*]150
[*]151
[*]152
[*]153
[*]154
[*]155
[*]156
[*]157
[*]158
[*]159
[*]160
[*]161
[*]162
[*]163
[*]164
[*]165
[*]166
[*]167
[*]168
[*]169
[*]170
[*]171
[*]172
[*]173
[*]174
[*]175
[*]176
[*]177
[*]178
正常运行结果:------------------------------------ URL: https://zh.erocool.me/detail/1547299o296458.html Local directory: F:/temp/aira2temp/erocool/[文雅] Aの輪上に踊る (コミック エグゼ 22) [中国翻訳] 1/27 State: no jpg file, to be downloaded Download finished File size: 433.84 KB 2/27 State: no jpg file, to be downloaded Download finished File size: 430.27 KB 3/27 State: no jpg file, to be downloaded Download finished File size: 406.48 KB 4/27 State: no jpg file, to be downloaded Download finished File size: 437.8 KB 5/27 State: no jpg file, to be downloaded Download finished File size: 492.18 KB 6/27 State: no jpg file, to be downloaded Download finished File size: 494.39 KB 7/27 State: no jpg file, to be downloaded Download finished File size: 461.91 KB 8/27 State: no jpg file, to be downloaded Download finished File size: 419.5 KB 9/27 State: no jpg file, to be downloaded Download finished File size: 442.8 KB 10/27 State: no jpg file, to be downloaded Download finished File size: 408.69 KB 11/27 State: no jpg file, to be downloaded Download finished File size: 393.77 KB 12/27 State: no jpg file, to be downloaded Download finished File size: 394.89 KB 13/27 State: no jpg file, to be downloaded Download finished File size: 436.82 KB 14/27 State: no jpg file, to be downloaded Download finished File size: 410.19 KB 15/27 State: no jpg file, to be downloaded Download finished File size: 440.01 KB 16/27 State: no jpg file, to be downloaded Download finished State: invalid jpg file, try again Download finished State: invalid jpg file, try png State: no png file, to be downloaded Download finished File size: 1201.58 KB 17/27 State: no jpg file, to be downloaded Download finished State: invalid jpg file, try again Download finished State: invalid jpg file, try png State: no png file, to be downloaded Download finished File size: 1109.1 KB 18/27 State: no jpg file, to be downloaded Download finished File size: 408.06 KB 19/27 State: no jpg file, to be downloaded Download finished File size: 
386.98 KB 20/27 State: no jpg file, to be downloaded Download finished File size: 435.79 KB 21/27 State: no jpg file, to be downloaded Download finished File size: 423.48 KB 22/27 State: no jpg file, to be downloaded Download finished File size: 414.71 KB 23/27 State: no jpg file, to be downloaded Download finished File size: 405.81 KB 24/27 State: no jpg file, to be downloaded Download finished State: invalid jpg file, try again Download finished State: invalid jpg file, try png State: no png file, to be downloaded Download finished File size: 1176.14 KB 25/27 State: no jpg file, to be downloaded Download finished File size: 432.11 KB 26/27 State: no jpg file, to be downloaded Download finished File size: 372.41 KB 27/27 State: no jpg file, to be downloaded Download finished File size: 173.54 KBProcess finished with exit code 0
[*]1
[*]2
[*]3
[*]4
[*]5
[*]6
[*]7
[*]8
[*]9
[*]10
[*]11
[*]12
[*]13
[*]14
[*]15
[*]16
[*]17
[*]18
[*]19
[*]20
[*]21
[*]22
[*]23
[*]24
[*]25
[*]26
[*]27
[*]28
[*]29
[*]30
[*]31
[*]32
[*]33
[*]34
[*]35
[*]36
[*]37
[*]38
[*]39
[*]40
[*]41
[*]42
[*]43
[*]44
[*]45
[*]46
[*]47
[*]48
[*]49
[*]50
[*]51
[*]52
[*]53
[*]54
[*]55
[*]56
[*]57
[*]58
[*]59
[*]60
[*]61
[*]62
[*]63
[*]64
[*]65
[*]66
[*]67
[*]68
[*]69
[*]70
[*]71
[*]72
[*]73
[*]74
[*]75
[*]76
[*]77
[*]78
[*]79
[*]80
[*]81
[*]82
[*]83
[*]84
[*]85
[*]86
[*]87
[*]88
[*]89
[*]90
[*]91
[*]92
[*]93
[*]94
[*]95
[*]96
[*]97
[*]98
[*]99
[*]100
[*]101
[*]102
[*]103
[*]104
[*]105
[*]106
[*]107
[*]108
[*]109
[*]110
[*]111
[*]112
[*]113
[*]114
[*]115
[*]116
[*]117
[*]118
[*]119
[*]120
[*]121
[*]122
[*]123
[*]124
[*]125
[*]126
[*]127
[*]128
[*]129
[*]130
V3
改进了下载jpg与png的逻辑,增加了从下载页面提取本子url的功能。
代码:import reimport requestsfrom lxml import etreeimport os# 函数定义# 创建文件夹def mkdir(path): # 判断是否存在文件夹如果不存在则创建为文件夹 # 如果路径不存在会创建这个路径 folder = os.path.exists(path) if not folder: os.makedirs(path)# 超时处理def get(url, header): i = 0 while i < 3: try: result = requests.get(url, headers=header, timeout=5) return result except requests.exceptions.RequestException: print("TIME OUT " + str(i+1)) i += 1# 补零操作def zero_fill(path): file_list = os.listdir(path) for file in file_list: if not file.endswith('.txt'): # 补0 10表示补0后名字共10位 filename = file.zfill(10) os.rename(path + '/' + file, path + '/' + filename)# 从搜索页面提取urlsdef get_urls(search_url): # User Agent header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36', 'Cookie': '_ga=GA1.2.99173477.1570796706; csrftoken=OK1ZGOurCtTNFgBhOEauJm3krQyQVR28xSP7Zu9EEv8MjiCgwdQyPyKqViaGkmG4; Hm_lvt_7fdef555dc32f7d31fadd14999021b7b=1570796701,1570941042; _gid=GA1.2.160071259.1570941044; Hm_lpvt_7fdef555dc32f7d31fadd14999021b7b=1570941059', 'Connection': 'close' } response = get(search_url, header) if response is None: return [ url_head = 'https://zh.erocool.me' urls = etree.HTML(response.text).xpath('//div[@class="list-wrapper"]/a/@href') for i in range(len(urls)): urls[i = url_head + urls[i print(urls[i) return urls# 下载文件def dld_erocool(erocool_urls, local_path, head_type): # 功能 # 下载erocool链接中的图片 # erocool_urls # erocool链接 # local_path # 下载位置,将在该位置创建文件夹 # head_type # 1:长链接,下载一般尺寸;0:短连接,下载竖直方向特长尺寸(韩漫) for erocool_url in erocool_urls: print('------------------------------------') # 确定下载链接url中部,从erocool链接中提取 url_mid = re.search('detail/(.*)o', erocool_url).group(1) # 确定下载链接url头部,由head_type确定 if head_type == 1: url_head = 'https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/' elif head_type == 0: url_head = 'https://mi.404cdn.com/galleries/' # User Agent header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/27.0.1453.94 Safari/537.36', 'Cookie': '_ga=GA1.2.99173477.1570796706; csrftoken=OK1ZGOurCtTNFgBhOEauJm3krQyQVR28xSP7Zu9EEv8MjiCgwdQyPyKqViaGkmG4; Hm_lvt_7fdef555dc32f7d31fadd14999021b7b=1570796701,1570941042; _gid=GA1.2.160071259.1570941044; Hm_lpvt_7fdef555dc32f7d31fadd14999021b7b=1570941059', 'Connection': 'close' } # 请求 print('\tURL:') print('\t\t' + erocool_url) response = get(erocool_url, header) # 选取数据:总页数、名称 pic_num = int( re.search('(.*) 頁', etree.HTML(response.text).xpath('//div[@class = "ld_box"]/div/div/text()')[3).group(1)) ero_name = etree.HTML(response.text).xpath('//h1/text()')[0 ero_name = ero_name.replace('/', '-') ero_name = ero_name.replace(':', '-') ero_name = ero_name.replace('?', '-') # 创建文件夹 local_ero_path = local_path + '/' + ero_name mkdir(local_ero_path) local_url_path = local_ero_path + '/url.txt' with open(local_url_path, 'w') as file: file.write(erocool_url) print('\t' + 'Local directory:') print('\t\t' + local_ero_path) # 第一张的默认下载类型 # 1:jpg # 0:png default_type = 1 for i in range(1, pic_num + 1): print('\t' + str(i) + '/' + str(pic_num)) pic_name_jpg = str(i) + '.' + 'jpg' pic_url_jpg = url_head + url_mid + '/' + pic_name_jpg local_name_jpg = local_ero_path + '/' + pic_name_jpg.zfill(10) pic_name_png = str(i) + '.' 
+ 'png' pic_url_png = url_head + url_mid + '/' + pic_name_png local_name_png = local_ero_path + '/' + pic_name_png.zfill(10) size = 0 while size < 1024: # 是否存在jpg在文件 exist_jpg = os.path.isfile(local_name_jpg) # 是否存在png在文件 exist_png = os.path.isfile(local_name_png) # 如果都不存在 if (not exist_jpg) & (not exist_png): if default_type == 1: print('\t\t' + 'State: no jpg or png file, try jpg') # 下载jpg with open(local_name_jpg, 'wb') as file: content_temp = get(pic_url_jpg, header) file.write(content_temp.content) print('\t\t' + 'Download finished') size = os.path.getsize(local_name_jpg) elif default_type == 0: print('\t\t' + 'State: no jpg or png file, try png') # 下载jpg with open(local_name_png, 'wb') as file: content_temp = get(pic_url_png, header) file.write(content_temp.content) print('\t\t' + 'Download finished') size = os.path.getsize(local_name_png) # 如果jpg存在 elif exist_jpg: # 如果jpg无效 if os.path.getsize(local_name_jpg) < 1024: # 下载png print('\t\t' + 'State: invalid jpg file, try png') os.remove(local_name_jpg) with open(local_name_png, 'wb') as file: content_temp = get(pic_url_png, header) file.write(content_temp.content) print('\t\t' + 'Download finished') size = os.path.getsize(local_name_png) default_type = 0 else: print('\t\t' + 'State: valid jpg file') size = os.path.getsize(local_name_jpg) # 如果png存在 elif exist_png: # 如果png无效 if os.path.getsize(local_name_png) < 1024: # 下载jpg print('\t\t' + 'State: invalid png file, try jpg') os.remove(local_name_png) with open(local_name_jpg, 'wb') as file: content_temp = get(pic_url_jpg, header) file.write(content_temp.content) print('\t\t' + 'Download finished') size = os.path.getsize(local_name_jpg) default_type = 1 else: print('\t\t' + 'State: valid png file') size = os.path.getsize(local_name_png) print('\t\t' + 'File size: ' + str(round(size / 1024, 2)) + ' KB')# 主程序# 下载位置,将在该位置创建文件夹local_path = "F:/temp/aira2temp/erocool"# 用搜索链接提取urlssearch_url = ''# 直接给具体本子的urlsurls2 = [# 图片下载链接类型# 
1:长链接,下载一般尺寸;0:短连接,下载竖直方向特长尺寸(韩漫)head_type = 1# 下载urls1 = get_urls(search_url)dld_erocool(urls1, local_path, head_type)dld_erocool(urls2, local_path, head_type)
[*]1
[*]2
[*]3
[*]4
[*]5
[*]6
[*]7
[*]8
[*]9
[*]10
[*]11
[*]12
[*]13
[*]14
[*]15
[*]16
[*]17
[*]18
[*]19
[*]20
[*]21
[*]22
[*]23
[*]24
[*]25
[*]26
[*]27
[*]28
[*]29
[*]30
[*]31
[*]32
[*]33
[*]34
[*]35
[*]36
[*]37
[*]38
[*]39
[*]40
[*]41
[*]42
[*]43
[*]44
[*]45
[*]46
[*]47
[*]48
[*]49
[*]50
[*]51
[*]52
[*]53
[*]54
[*]55
[*]56
[*]57
[*]58
[*]59
[*]60
[*]61
[*]62
[*]63
[*]64
[*]65
[*]66
[*]67
[*]68
[*]69
[*]70
[*]71
[*]72
[*]73
[*]74
[*]75
[*]76
[*]77
[*]78
[*]79
[*]80
[*]81
[*]82
[*]83
[*]84
[*]85
[*]86
[*]87
[*]88
[*]89
[*]90
[*]91
[*]92
[*]93
[*]94
[*]95
[*]96
[*]97
[*]98
[*]99
[*]100
[*]101
[*]102
[*]103
[*]104
[*]105
[*]106
[*]107
[*]108
[*]109
[*]110
[*]111
[*]112
[*]113
[*]114
[*]115
[*]116
[*]117
[*]118
[*]119
[*]120
[*]121
[*]122
[*]123
[*]124
[*]125
[*]126
[*]127
[*]128
[*]129
[*]130
[*]131
[*]132
[*]133
[*]134
[*]135
[*]136
[*]137
[*]138
[*]139
[*]140
[*]141
[*]142
[*]143
[*]144
[*]145
[*]146
[*]147
[*]148
[*]149
[*]150
[*]151
[*]152
[*]153
[*]154
[*]155
[*]156
[*]157
[*]158
[*]159
[*]160
[*]161
[*]162
[*]163
[*]164
[*]165
[*]166
[*]167
[*]168
[*]169
[*]170
[*]171
[*]172
[*]173
[*]174
[*]175
[*]176
[*]177
[*]178
[*]179
[*]180
[*]181
[*]182
[*]183
[*]184
[*]185
[*]186
[*]187
[*]188
[*]189
[*]190
[*]191
[*]192
[*]193
[*]194
[*]195
[*]196
[*]197
[*]198
[*]199
[*]200
[*]201
[*]202
[*]203
[*]204
[*]205
[*]206
[*]207
代码中没有具体的下载链接。 抱歉,未能达到申请要求,申请不通过,可以关注论坛官方微信(吾爱破解论坛),等待开放注册通知。
页:
[1]