1、申请ID:AsCenSion
2、个人邮箱:549847849@qq.com
3、原创技术文章:
个人介绍:
个人比较喜欢爬虫,喜欢在网上通过爬虫下载一些资源。
原创内容介绍:
需求
最近偶然对本子感兴趣起来,但是由于众所周知的原因本子这个东西不太容易下载,所以就想自己写个爬虫下载本子。
源
本人知道的不用科学上网就能访问的本子网站目前只有erocool。
地址发布页:https://tellmeurl.com/erocool/
中文版地址:https://zh.erocool.me/
世徒老师的本是真的顶,下面就以:
[世徒ゆうき] 千歳 -chitose- 第四話 (COMIC 夢幻転生 2020年8月号) [中国翻訳]
为例写个爬虫。
地址:https://zh.erocool.me/detail/1686650o321307.html
正文
分析
用chrome的F12功能分析网页中本子图片的下载链接:
可以看到“src=”的后面有个地址:
https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/1686650/1.jpg
可能是图片下载地址,试一试,确实是。
多试几个本子,可得下载地址格式为:
https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/a/n.type
其中a由本子决定,n是页数,type为文件格式,绝大多数为jpg,小部分为png。
a其实就是本子地址中间的一段数字,可以用re.search提取。
不难发现这个格式后半部分也可以构成一个地址,为:
https://mi.404cdn.com/galleries/a/n.type
尝试后发现这也是一个图片的下载地址,但是下载速度较慢。
为讨论方便,用A格式指代较长的格式,B格式指代较短的格式。
A:https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/a/n.type
B:https://mi.404cdn.com/galleries/a/n.type
二者差别如下:
1.A格式地址下载速度快,适合下载图片比例一般的漫画;
2.下载竖直/水平比例较大的漫画,尤其是韩漫的时候,A格式地址返回的图片清晰度极低,没法看,此时只能使用B格式下载。
A格式的下载链接实际上可以用Xpath提取,格式为:
//img[@class = "vimg lazyloaded"]/@src
这在浏览器中使用Xpath提取是没有问题的,但是实际在pycharm中运行时,可能由于图片是根据浏览位置来懒加载的,requests返回的html文件中提取不到这个信息。
所以在爬取图片的时候,这个链接就需要我们自己按照分析出来的格式来构造了。
# (Article text) As a consequence, the download-link format and the file type
# have to be chosen by hand before downloading.
#
# Code, v1 -- with the analysis done, the script can be written as follows.
import os
import re

import requests
from lxml import etree

# Request headers sent with every page and image request.
# NOTE(review): the Cookie was copied from a browser session and is probably
# stale -- confirm whether the site needs it at all.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    'Cookie': '_ga=GA1.2.99173477.1570796706; '
              'csrftoken=OK1ZGOurCtTNFgBhOEauJm3krQyQVR28xSP7Zu9EEv8MjiCgwdQyPyKqViaGkmG4; '
              'Hm_lvt_7fdef555dc32f7d31fadd14999021b7b=1570796701,1570941042; '
              '_gid=GA1.2.160071259.1570941044; '
              'Hm_lpvt_7fdef555dc32f7d31fadd14999021b7b=1570941059',
    'Connection': 'close',
}

# Download-link prefix for each head_type.
# 1: long link (format A), for ordinary page sizes.
# 0: short link (format B), for very tall pages.
URL_HEADS = {
    1: 'https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/',
    0: 'https://mi.404cdn.com/galleries/',
}

# Anything smaller than this many bytes is treated as a failed download.
MIN_VALID_SIZE = 1024

# Characters that are not allowed in Windows file names.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')


def mkdir(path):
    """Create the directory *path* (and any missing parents) if needed."""
    os.makedirs(path, exist_ok=True)


def get(url, header):
    """GET *url* with up to three attempts.

    Returns the ``requests.Response``, or ``None`` when every attempt raised
    a ``requests`` exception (timeout, connection error, invalid URL, ...).
    """
    for attempt in range(1, 4):
        try:
            return requests.get(url, headers=header, timeout=5)
        except requests.exceptions.RequestException:
            print("TIME OUT " + str(attempt))
    return None


def zero_fill(path):
    """Left-pad every non-.txt file name in *path* with zeros to 10 chars."""
    for file in os.listdir(path):
        if not file.endswith('.txt'):
            os.rename(path + '/' + file, path + '/' + file.zfill(10))


def _save_image(pic_url, local_name, header):
    """Download *pic_url* to *local_name*.

    The request is made before the file is opened, and the file is written
    only when the response looks like a real image, so a failed request
    never leaves an empty or truncated file behind.  Returns True on success.
    """
    response = get(pic_url, header)
    if response is None or not response.ok:
        return False
    if len(response.content) < MIN_VALID_SIZE:
        return False
    with open(local_name, 'wb') as file:
        file.write(response.content)
    return True


def _parse_gallery_page(html_text):
    """Return ``(page_count, title)`` scraped from a gallery detail page.

    Raises ValueError when the page does not have the expected layout.
    """
    try:
        tree = etree.HTML(html_text)
    except etree.LxmlError as err:
        raise ValueError('gallery page could not be parsed') from err
    if tree is None:
        raise ValueError('gallery page is empty')
    info = tree.xpath('//div[@class = "ld_box"]/div/div/text()')
    titles = tree.xpath('//h1/text()')
    if len(info) < 4 or not titles:
        raise ValueError('unexpected gallery page layout')
    count = re.search(r'(\d+) 頁', info[3])
    if count is None:
        raise ValueError('page count not found')
    return int(count.group(1)), titles[0]


def _prepare_gallery(erocool_url, local_path):
    """Fetch a gallery page and create its download folder.

    Returns ``(url_mid, pic_num, local_ero_path)``, or ``None`` (after
    printing the reason) when the gallery has to be skipped.
    """
    print('\tURL:')
    print('\t\t' + erocool_url)
    # The numeric gallery id sits between "detail/" and the letter "o".
    id_match = re.search(r'detail/(\d+)o', erocool_url)
    if id_match is None:
        print('\t\t' + 'Skipped: no gallery id found in the URL')
        return None
    response = get(erocool_url, HEADER)
    if response is None:
        print('\t\t' + 'Skipped: gallery page could not be fetched')
        return None
    try:
        pic_num, ero_name = _parse_gallery_page(response.text)
    except ValueError as err:
        print('\t\t' + 'Skipped: ' + str(err))
        return None
    local_ero_path = local_path + '/' + _UNSAFE_CHARS.sub('-', ero_name)
    mkdir(local_ero_path)
    # Remember where the folder's content came from.
    with open(local_ero_path + '/url.txt', 'w', encoding='utf-8') as file:
        file.write(erocool_url)
    print('\t' + 'Local directory:')
    print('\t\t' + local_ero_path)
    return id_match.group(1), pic_num, local_ero_path


def dld_erocool(erocool_urls, local_path, head_type, file_type):
    """Download every page image of each gallery in *erocool_urls*.

    Args:
        erocool_urls: gallery detail-page URLs.
        local_path: download location; one sub-folder per gallery is
            created inside it.
        head_type: 1 for the long link (ordinary sizes), 0 for the short
            link (very tall pages).
        file_type: image extension to request, e.g. 'jpg' or 'png'.

    Raises:
        ValueError: if *head_type* is neither 0 nor 1.
    """
    if head_type not in URL_HEADS:
        raise ValueError('head_type must be 1 (long link) or 0 (short link)')
    url_head = URL_HEADS[head_type]

    for erocool_url in erocool_urls:
        print('------------------------------------')
        gallery = _prepare_gallery(erocool_url, local_path)
        if gallery is None:
            continue
        url_mid, pic_num, local_ero_path = gallery

        for i in range(1, pic_num + 1):
            pic_name = str(i) + '.' + file_type
            pic_url = url_head + url_mid + '/' + pic_name
            local_name = local_ero_path + '/' + pic_name.zfill(10)
            print('\t' + str(i) + '/' + str(pic_num))

            if not os.path.isfile(local_name):
                print('\t\t' + 'State: no such file, to be downloaded')
                needs_download = True
            elif os.path.getsize(local_name) < MIN_VALID_SIZE:
                print('\t\t' + 'State: invalid file, to be downloaded')
                # Drop the stale file so a failed retry cannot pass for it.
                os.remove(local_name)
                needs_download = True
            else:
                print('\t\t' + 'State: downloaded already')
                needs_download = False

            if needs_download:
                if _save_image(pic_url, local_name, HEADER):
                    print('\t\t' + 'Download finished')
                else:
                    print('\t\t' + 'Download failed')
                    continue

            size = str(round(os.path.getsize(local_name) / 1024, 2))
            print('\t\t' + 'File size: ' + size + ' KB')


# Main program
if __name__ == '__main__':
    # Download location; the gallery folders are created inside it.
    local_path = "F:/temp/aira2temp/erocool"
    # Gallery links.
    urls = [
        'https://zh.erocool.me/detail/1686650o321307.html',
    ]
    # Image link type.
    # 1: long link, ordinary sizes; 0: short link, very tall pages.
    head_type = 1
    # Image type: usually jpg, occasionally png.
    file_type = 'jpg'
    # Download.
    dld_erocool(urls, local_path, head_type, file_type)
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
正常运行结果:------------------------------------ URL: https://zh.erocool.me/detail/1686650o321307.html Local directory: F:/temp/aira2temp/erocool/[世徒ゆうき] 千歳 -chitose- 第四話 (COMIC 夢幻転生 2020年8月号) [中国翻訳] 1/42 State: no such file, to be downloaded Download finished File size: 460.71 KB 2/42 State: no such file, to be downloaded Download finished File size: 400.66 KB 3/42 State: no such file, to be downloaded Download finished File size: 434.52 KB 4/42 State: no such file, to be downloaded Download finished File size: 414.37 KB 5/42 State: no such file, to be downloaded Download finished File size: 381.43 KB 6/42 State: no such file, to be downloaded Download finished File size: 407.48 KB 7/42 State: no such file, to be downloaded Download finished File size: 388.39 KB 8/42 State: no such file, to be downloaded Download finished File size: 404.4 KB 9/42 State: no such file, to be downloaded Download finished File size: 432.16 KB 10/42 State: no such file, to be downloaded Download finished File size: 391.35 KB 11/42 State: no such file, to be downloaded Download finished File size: 365.73 KB 12/42 State: no such file, to be downloaded Download finished File size: 416.43 KB 13/42 State: no such file, to be downloaded Download finished File size: 367.79 KB 14/42 State: no such file, to be downloaded Download finished File size: 396.92 KB 15/42 State: no such file, to be downloaded Download finished File size: 385.43 KB 16/42 State: no such file, to be downloaded Download finished File size: 479.14 KB 17/42 State: no such file, to be downloaded Download finished File size: 444.29 KB 18/42 State: no such file, to be downloaded Download finished File size: 405.21 KB 19/42 State: no such file, to be downloaded Download finished File size: 398.37 KB 20/42 State: no such file, to be downloaded Download finished File size: 455.32 KB 21/42 State: no such file, to be downloaded Download finished File size: 409.46 KB 22/42 State: no such file, to be downloaded Download finished File 
size: 365.91 KB 23/42 State: no such file, to be downloaded Download finished File size: 397.73 KB 24/42 State: no such file, to be downloaded Download finished File size: 491.33 KB 25/42 State: no such file, to be downloaded Download finished File size: 420.46 KB 26/42 State: no such file, to be downloaded Download finished File size: 418.84 KB 27/42 State: no such file, to be downloaded Download finished File size: 449.82 KB 28/42 State: no such file, to be downloaded Download finished File size: 489.8 KB 29/42 State: no such file, to be downloaded Download finished File size: 487.49 KB 30/42 State: no such file, to be downloaded Download finished File size: 434.8 KB 31/42 State: no such file, to be downloaded Download finished File size: 352.65 KB 32/42 State: no such file, to be downloaded Download finished File size: 369.91 KB 33/42 State: no such file, to be downloaded Download finished File size: 481.56 KB 34/42 State: no such file, to be downloaded Download finished File size: 371.06 KB 35/42 State: no such file, to be downloaded Download finished File size: 333.14 KB 36/42 State: no such file, to be downloaded Download finished File size: 367.17 KB 37/42 State: no such file, to be downloaded Download finished File size: 403.3 KB 38/42 State: no such file, to be downloaded Download finished File size: 405.74 KB 39/42 State: no such file, to be downloaded Download finished File size: 457.22 KB 40/42 State: no such file, to be downloaded Download finished File size: 425.29 KB 41/42 State: no such file, to be downloaded Download finished File size: 438.93 KB 42/42 State: no such file, to be downloaded Download finished File size: 417.93 KBProcess finished with exit code 0- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
在主程序下方可以更改下载位置以及本子链接。
不足之处在于具体图片下载链接类型、图片类型要根据本子具体类型自行判断。原因就是本子链接返回的html内容中没有这些信息。
各位大佬如果有能够通过本子链接返回的html内容来提取图片下载链接类型、图片类型的方法,万望在评论区教我。
v2
第二版代码。
相比上一版的改进:默认下载jpg文件,如果文件太小则会认为文件无效,继续下载png文件,所以这一版不需要知道图片具体的类型是jpg还是png,可以应对漫画内容既有jpg也有png的情况。
以下代码下载的是一个既有jpg也有png的本子。
# Code, v2 -- tries jpg first and falls back to png for each page.
import os
import re

import requests
from lxml import etree

# Request headers sent with every page and image request.
# NOTE(review): the Cookie was copied from a browser session and is probably
# stale -- confirm whether the site needs it at all.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    'Cookie': '_ga=GA1.2.99173477.1570796706; '
              'csrftoken=OK1ZGOurCtTNFgBhOEauJm3krQyQVR28xSP7Zu9EEv8MjiCgwdQyPyKqViaGkmG4; '
              'Hm_lvt_7fdef555dc32f7d31fadd14999021b7b=1570796701,1570941042; '
              '_gid=GA1.2.160071259.1570941044; '
              'Hm_lpvt_7fdef555dc32f7d31fadd14999021b7b=1570941059',
    'Connection': 'close',
}

# Download-link prefix for each head_type.
# 1: long link (format A), for ordinary page sizes.
# 0: short link (format B), for very tall pages.
URL_HEADS = {
    1: 'https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/',
    0: 'https://mi.404cdn.com/galleries/',
}

# Anything smaller than this many bytes is treated as a failed download.
MIN_VALID_SIZE = 1024

# Image types to try for every page, in order.
IMAGE_TYPES = ('jpg', 'png')

# Characters that are not allowed in Windows file names.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')


def mkdir(path):
    """Create the directory *path* (and any missing parents) if needed."""
    os.makedirs(path, exist_ok=True)


def get(url, header):
    """GET *url* with up to three attempts.

    Returns the ``requests.Response``, or ``None`` when every attempt raised
    a ``requests`` exception (timeout, connection error, invalid URL, ...).
    """
    for attempt in range(1, 4):
        try:
            return requests.get(url, headers=header, timeout=5)
        except requests.exceptions.RequestException:
            print("TIME OUT " + str(attempt))
    return None


def zero_fill(path):
    """Left-pad every non-.txt file name in *path* with zeros to 10 chars."""
    for file in os.listdir(path):
        if not file.endswith('.txt'):
            os.rename(path + '/' + file, path + '/' + file.zfill(10))


def _is_valid_file(local_name):
    """Return True if *local_name* exists and is large enough to be an image."""
    return (os.path.isfile(local_name)
            and os.path.getsize(local_name) >= MIN_VALID_SIZE)


def _save_image(pic_url, local_name, header):
    """Download *pic_url* to *local_name*.

    The request is made before the file is opened, and the file is written
    only when the response looks like a real image, so a failed request
    never leaves an empty or truncated file behind.  Returns True on success.
    """
    response = get(pic_url, header)
    if response is None or not response.ok:
        return False
    if len(response.content) < MIN_VALID_SIZE:
        return False
    with open(local_name, 'wb') as file:
        file.write(response.content)
    return True


def _parse_gallery_page(html_text):
    """Return ``(page_count, title)`` scraped from a gallery detail page.

    Raises ValueError when the page does not have the expected layout.
    """
    try:
        tree = etree.HTML(html_text)
    except etree.LxmlError as err:
        raise ValueError('gallery page could not be parsed') from err
    if tree is None:
        raise ValueError('gallery page is empty')
    info = tree.xpath('//div[@class = "ld_box"]/div/div/text()')
    titles = tree.xpath('//h1/text()')
    if len(info) < 4 or not titles:
        raise ValueError('unexpected gallery page layout')
    count = re.search(r'(\d+) 頁', info[3])
    if count is None:
        raise ValueError('page count not found')
    return int(count.group(1)), titles[0]


def _prepare_gallery(erocool_url, local_path):
    """Fetch a gallery page and create its download folder.

    Returns ``(url_mid, pic_num, local_ero_path)``, or ``None`` (after
    printing the reason) when the gallery has to be skipped.
    """
    print('\tURL:')
    print('\t\t' + erocool_url)
    # The numeric gallery id sits between "detail/" and the letter "o".
    id_match = re.search(r'detail/(\d+)o', erocool_url)
    if id_match is None:
        print('\t\t' + 'Skipped: no gallery id found in the URL')
        return None
    response = get(erocool_url, HEADER)
    if response is None:
        print('\t\t' + 'Skipped: gallery page could not be fetched')
        return None
    try:
        pic_num, ero_name = _parse_gallery_page(response.text)
    except ValueError as err:
        print('\t\t' + 'Skipped: ' + str(err))
        return None
    local_ero_path = local_path + '/' + _UNSAFE_CHARS.sub('-', ero_name)
    mkdir(local_ero_path)
    # Remember where the folder's content came from.
    with open(local_ero_path + '/url.txt', 'w', encoding='utf-8') as file:
        file.write(erocool_url)
    print('\t' + 'Local directory:')
    print('\t\t' + local_ero_path)
    return id_match.group(1), pic_num, local_ero_path


def _fetch_page(page_url, local_ero_path, number):
    """Make sure page *number* is on disk as a valid jpg or png.

    *page_url* is the image URL without its extension.  Returns the local
    file name of the valid image, or ``None`` if neither type could be
    downloaded.
    """
    local_names = {
        ext: local_ero_path + '/' + (str(number) + '.' + ext).zfill(10)
        for ext in IMAGE_TYPES
    }

    # A valid file of either type from an earlier run settles the matter;
    # checking both first avoids re-requesting the jpg of a png page.
    for ext in IMAGE_TYPES:
        if _is_valid_file(local_names[ext]):
            print('\t\t' + 'State: ' + ext + ' downloaded already')
            return local_names[ext]

    for ext in IMAGE_TYPES:
        local_name = local_names[ext]
        if os.path.isfile(local_name):
            # Left over from an earlier failed attempt.
            os.remove(local_name)
        print('\t\t' + 'State: no ' + ext + ' file, to be downloaded')
        if _save_image(page_url + '.' + ext, local_name, HEADER):
            print('\t\t' + 'Download finished')
            return local_name
        print('\t\t' + 'State: invalid ' + ext + ' file, try again')
        if _save_image(page_url + '.' + ext, local_name, HEADER):
            print('\t\t' + 'Download finished')
            return local_name
        if ext == 'jpg':
            print('\t\t' + 'State: invalid jpg file, try png')

    print('\t\t' + 'State: no valid jpg or png file, skipped')
    return None


def dld_erocool(erocool_urls, local_path, head_type):
    """Download every page image of each gallery in *erocool_urls*.

    Each page is requested as jpg first and as png if the jpg is invalid,
    so galleries mixing both types are handled.

    Args:
        erocool_urls: gallery detail-page URLs.
        local_path: download location; one sub-folder per gallery is
            created inside it.
        head_type: 1 for the long link (ordinary sizes), 0 for the short
            link (very tall pages).

    Raises:
        ValueError: if *head_type* is neither 0 nor 1.
    """
    if head_type not in URL_HEADS:
        raise ValueError('head_type must be 1 (long link) or 0 (short link)')
    url_head = URL_HEADS[head_type]

    for erocool_url in erocool_urls:
        print('------------------------------------')
        gallery = _prepare_gallery(erocool_url, local_path)
        if gallery is None:
            continue
        url_mid, pic_num, local_ero_path = gallery

        for i in range(1, pic_num + 1):
            print('\t' + str(i) + '/' + str(pic_num))
            local_name = _fetch_page(
                url_head + url_mid + '/' + str(i), local_ero_path, i)
            if local_name is None:
                continue
            size = str(round(os.path.getsize(local_name) / 1024, 2))
            print('\t\t' + 'File size: ' + size + ' KB')


# Main program
if __name__ == '__main__':
    # Download location; the gallery folders are created inside it.
    local_path = "F:/temp/aira2temp/erocool"
    # Gallery links.
    urls = [
        'https://zh.erocool.me/detail/1547299o296458.html',
    ]
    # Image link type.
    # 1: long link, ordinary sizes; 0: short link, very tall pages.
    head_type = 1
    # Download.
    dld_erocool(urls, local_path, head_type)
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
正常运行结果:------------------------------------ URL: https://zh.erocool.me/detail/1547299o296458.html Local directory: F:/temp/aira2temp/erocool/[文雅] Aの輪上に踊る (コミック エグゼ 22) [中国翻訳] [DL版] 1/27 State: no jpg file, to be downloaded Download finished File size: 433.84 KB 2/27 State: no jpg file, to be downloaded Download finished File size: 430.27 KB 3/27 State: no jpg file, to be downloaded Download finished File size: 406.48 KB 4/27 State: no jpg file, to be downloaded Download finished File size: 437.8 KB 5/27 State: no jpg file, to be downloaded Download finished File size: 492.18 KB 6/27 State: no jpg file, to be downloaded Download finished File size: 494.39 KB 7/27 State: no jpg file, to be downloaded Download finished File size: 461.91 KB 8/27 State: no jpg file, to be downloaded Download finished File size: 419.5 KB 9/27 State: no jpg file, to be downloaded Download finished File size: 442.8 KB 10/27 State: no jpg file, to be downloaded Download finished File size: 408.69 KB 11/27 State: no jpg file, to be downloaded Download finished File size: 393.77 KB 12/27 State: no jpg file, to be downloaded Download finished File size: 394.89 KB 13/27 State: no jpg file, to be downloaded Download finished File size: 436.82 KB 14/27 State: no jpg file, to be downloaded Download finished File size: 410.19 KB 15/27 State: no jpg file, to be downloaded Download finished File size: 440.01 KB 16/27 State: no jpg file, to be downloaded Download finished State: invalid jpg file, try again Download finished State: invalid jpg file, try png State: no png file, to be downloaded Download finished File size: 1201.58 KB 17/27 State: no jpg file, to be downloaded Download finished State: invalid jpg file, try again Download finished State: invalid jpg file, try png State: no png file, to be downloaded Download finished File size: 1109.1 KB 18/27 State: no jpg file, to be downloaded Download finished File size: 408.06 KB 19/27 State: no jpg file, to be downloaded Download finished File size: 
386.98 KB 20/27 State: no jpg file, to be downloaded Download finished File size: 435.79 KB 21/27 State: no jpg file, to be downloaded Download finished File size: 423.48 KB 22/27 State: no jpg file, to be downloaded Download finished File size: 414.71 KB 23/27 State: no jpg file, to be downloaded Download finished File size: 405.81 KB 24/27 State: no jpg file, to be downloaded Download finished State: invalid jpg file, try again Download finished State: invalid jpg file, try png State: no png file, to be downloaded Download finished File size: 1176.14 KB 25/27 State: no jpg file, to be downloaded Download finished File size: 432.11 KB 26/27 State: no jpg file, to be downloaded Download finished File size: 372.41 KB 27/27 State: no jpg file, to be downloaded Download finished File size: 173.54 KBProcess finished with exit code 0- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
V3
改进了下载jpg与png的逻辑,增加了从搜索页面提取本子url的功能。
# Code, v3 -- remembers the image type of the previous page and can collect
# gallery URLs from a search page.
import os
import re

import requests
from lxml import etree

# Request headers sent with every page and image request.
# NOTE(review): the Cookie was copied from a browser session and is probably
# stale -- confirm whether the site needs it at all.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    'Cookie': '_ga=GA1.2.99173477.1570796706; '
              'csrftoken=OK1ZGOurCtTNFgBhOEauJm3krQyQVR28xSP7Zu9EEv8MjiCgwdQyPyKqViaGkmG4; '
              'Hm_lvt_7fdef555dc32f7d31fadd14999021b7b=1570796701,1570941042; '
              '_gid=GA1.2.160071259.1570941044; '
              'Hm_lpvt_7fdef555dc32f7d31fadd14999021b7b=1570941059',
    'Connection': 'close',
}

# Download-link prefix for each head_type.
# 1: long link (format A), for ordinary page sizes.
# 0: short link (format B), for very tall pages.
URL_HEADS = {
    1: 'https://search.pstatic.net/common?src=https://mi.404cdn.com/galleries/',
    0: 'https://mi.404cdn.com/galleries/',
}

# Anything smaller than this many bytes is treated as a failed download.
MIN_VALID_SIZE = 1024

# For each image type, the alternative to try when it fails.
_OTHER_TYPE = {'jpg': 'png', 'png': 'jpg'}

# Characters that are not allowed in Windows file names.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')


def mkdir(path):
    """Create the directory *path* (and any missing parents) if needed."""
    os.makedirs(path, exist_ok=True)


def get(url, header):
    """GET *url* with up to three attempts.

    Returns the ``requests.Response``, or ``None`` when every attempt raised
    a ``requests`` exception (timeout, connection error, invalid URL, ...).
    """
    for attempt in range(1, 4):
        try:
            return requests.get(url, headers=header, timeout=5)
        except requests.exceptions.RequestException:
            print("TIME OUT " + str(attempt))
    return None


def zero_fill(path):
    """Left-pad every non-.txt file name in *path* with zeros to 10 chars."""
    for file in os.listdir(path):
        if not file.endswith('.txt'):
            os.rename(path + '/' + file, path + '/' + file.zfill(10))


def get_urls(search_url):
    """Return the gallery URLs listed on the search page *search_url*.

    Returns an empty list when *search_url* is empty, the request fails, or
    the page cannot be parsed.  Every URL found is also printed.
    """
    if not search_url:
        # Nothing to search; skip the pointless request and its retries.
        return []
    response = get(search_url, HEADER)
    if response is None:
        return []
    try:
        tree = etree.HTML(response.text)
    except etree.LxmlError:
        return []
    if tree is None:
        return []
    url_head = 'https://zh.erocool.me'
    # The hrefs on the search page are site-relative.
    urls = [url_head + href
            for href in tree.xpath('//div[@class="list-wrapper"]/a/@href')]
    for url in urls:
        print(url)
    return urls


def _is_valid_file(local_name):
    """Return True if *local_name* exists and is large enough to be an image."""
    return (os.path.isfile(local_name)
            and os.path.getsize(local_name) >= MIN_VALID_SIZE)


def _save_image(pic_url, local_name, header):
    """Download *pic_url* to *local_name*.

    The request is made before the file is opened, and the file is written
    only when the response looks like a real image, so a failed request
    never leaves an empty or truncated file behind.  Returns True on success.
    """
    response = get(pic_url, header)
    if response is None or not response.ok:
        return False
    if len(response.content) < MIN_VALID_SIZE:
        return False
    with open(local_name, 'wb') as file:
        file.write(response.content)
    return True


def _parse_gallery_page(html_text):
    """Return ``(page_count, title)`` scraped from a gallery detail page.

    Raises ValueError when the page does not have the expected layout.
    """
    try:
        tree = etree.HTML(html_text)
    except etree.LxmlError as err:
        raise ValueError('gallery page could not be parsed') from err
    if tree is None:
        raise ValueError('gallery page is empty')
    info = tree.xpath('//div[@class = "ld_box"]/div/div/text()')
    titles = tree.xpath('//h1/text()')
    if len(info) < 4 or not titles:
        raise ValueError('unexpected gallery page layout')
    count = re.search(r'(\d+) 頁', info[3])
    if count is None:
        raise ValueError('page count not found')
    return int(count.group(1)), titles[0]


def _prepare_gallery(erocool_url, local_path):
    """Fetch a gallery page and create its download folder.

    Returns ``(url_mid, pic_num, local_ero_path)``, or ``None`` (after
    printing the reason) when the gallery has to be skipped.
    """
    print('\tURL:')
    print('\t\t' + erocool_url)
    # The numeric gallery id sits between "detail/" and the letter "o".
    id_match = re.search(r'detail/(\d+)o', erocool_url)
    if id_match is None:
        print('\t\t' + 'Skipped: no gallery id found in the URL')
        return None
    response = get(erocool_url, HEADER)
    if response is None:
        print('\t\t' + 'Skipped: gallery page could not be fetched')
        return None
    try:
        pic_num, ero_name = _parse_gallery_page(response.text)
    except ValueError as err:
        print('\t\t' + 'Skipped: ' + str(err))
        return None
    local_ero_path = local_path + '/' + _UNSAFE_CHARS.sub('-', ero_name)
    mkdir(local_ero_path)
    # Remember where the folder's content came from.
    with open(local_ero_path + '/url.txt', 'w', encoding='utf-8') as file:
        file.write(erocool_url)
    print('\t' + 'Local directory:')
    print('\t\t' + local_ero_path)
    return id_match.group(1), pic_num, local_ero_path


def _fetch_page(page_url, local_ero_path, number, preferred):
    """Make sure page *number* is on disk as a valid jpg or png.

    *page_url* is the image URL without its extension; *preferred* is the
    type ('jpg' or 'png') to request first.  Each type is requested at most
    once, so the function always terminates.  Returns ``(ext, local_name)``
    for the valid image, or ``None`` if neither type could be downloaded.
    """
    local_names = {
        ext: local_ero_path + '/' + (str(number) + '.' + ext).zfill(10)
        for ext in ('jpg', 'png')
    }

    for ext in ('jpg', 'png'):
        if _is_valid_file(local_names[ext]):
            print('\t\t' + 'State: valid ' + ext + ' file')
            return ext, local_names[ext]

    # Whatever is still on disk is an invalid leftover of an earlier run.
    stale = [ext for ext in ('jpg', 'png') if os.path.isfile(local_names[ext])]
    for ext in stale:
        os.remove(local_names[ext])

    if len(stale) == 1:
        # That type already failed once, so start with the other one.
        first = _OTHER_TYPE[stale[0]]
        print('\t\t' + 'State: invalid ' + stale[0] + ' file, try ' + first)
    else:
        first = preferred
        print('\t\t' + 'State: no jpg or png file, try ' + first)
    if _save_image(page_url + '.' + first, local_names[first], HEADER):
        print('\t\t' + 'Download finished')
        return first, local_names[first]

    second = _OTHER_TYPE[first]
    print('\t\t' + 'State: invalid ' + first + ' file, try ' + second)
    if _save_image(page_url + '.' + second, local_names[second], HEADER):
        print('\t\t' + 'Download finished')
        return second, local_names[second]

    print('\t\t' + 'State: no valid jpg or png file, skipped')
    return None


def dld_erocool(erocool_urls, local_path, head_type):
    """Download every page image of each gallery in *erocool_urls*.

    The first page is requested as jpg; every later page is requested first
    in the type the previous page turned out to have, with the other type
    as fallback.

    Args:
        erocool_urls: gallery detail-page URLs.
        local_path: download location; one sub-folder per gallery is
            created inside it.
        head_type: 1 for the long link (ordinary sizes), 0 for the short
            link (very tall pages).

    Raises:
        ValueError: if *head_type* is neither 0 nor 1.
    """
    if head_type not in URL_HEADS:
        raise ValueError('head_type must be 1 (long link) or 0 (short link)')
    url_head = URL_HEADS[head_type]

    for erocool_url in erocool_urls:
        print('------------------------------------')
        gallery = _prepare_gallery(erocool_url, local_path)
        if gallery is None:
            continue
        url_mid, pic_num, local_ero_path = gallery

        # Type requested first for the next page.
        preferred = 'jpg'
        for i in range(1, pic_num + 1):
            print('\t' + str(i) + '/' + str(pic_num))
            result = _fetch_page(
                url_head + url_mid + '/' + str(i), local_ero_path, i, preferred)
            if result is None:
                continue
            preferred, local_name = result
            size = os.path.getsize(local_name)
            print('\t\t' + 'File size: ' + str(round(size / 1024, 2)) + ' KB')


# Main program
if __name__ == '__main__':
    # Download location; the gallery folders are created inside it.
    local_path = "F:/temp/aira2temp/erocool"
    # Search link from which gallery URLs are extracted.
    search_url = ''
    # Gallery URLs given directly.
    urls2 = [
    ]
    # Image link type.
    # 1: long link, ordinary sizes; 0: short link, very tall pages.
    head_type = 1
    # Download.
    urls1 = get_urls(search_url)
    dld_erocool(urls1, local_path, head_type)
    dld_erocool(urls2, local_path, head_type)
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
- 179
- 180
- 181
- 182
- 183
- 184
- 185
- 186
- 187
- 188
- 189
- 190
- 191
- 192
- 193
- 194
- 195
- 196
- 197
- 198
- 199
- 200
- 201
- 202
- 203
- 204
- 205
- 206
- 207
代码中没有具体的下载链接。