开发环境 python3.6
需要先将 git 添加到系统环境变量(PATH)中。缺点:运行程序时会出现弹窗。
#coding:UTF-8
import threading
import urllib.request
import time
import os
import re
# URL of the first result page to crawl; replace the placeholder below with a
# real URL before running the script.
index_url = "你要克隆的git第一页"
# Example: index_url = "https://github.com/search?q=51%E5%8D%95%E7%89%87%E6%9C%BA"
class Rein_git:
    """Scrape GitHub result pages for repository links and the next-page link.

    Each public method downloads the page at the URL it is given and runs a
    regular expression over the decoded markup.
    """

    # Base against which the site-relative "next page" href is resolved.
    _SITE_ROOT = "https://github.com/"

    def __init__(self, index_url):
        """Store the URL of the first page to crawl.

        Args:
            index_url: URL of the first result page.
        """
        self.index_url = index_url

    def _find_all(self, pattern, url):
        """Download *url*, decode the body and return ``re.findall`` results.

        The response is closed as soon as the body has been read. The body is
        decoded with the charset announced by the server (UTF-8 when none is
        announced or the announced one is unknown) instead of being passed
        through ``str(bytes)``, which would turn every non-ASCII byte and
        backslash into escape text.
        """
        with urllib.request.urlopen(url) as response:
            charset = response.headers.get_content_charset() or "utf-8"
            raw = response.read()
        try:
            text = raw.decode(charset, errors="replace")
        except LookupError:
            # Unknown charset label sent by the server.
            text = raw.decode("utf-8", errors="replace")
        # DOTALL keeps the earlier matching reach: the previous implementation
        # searched a bytes repr, which contains no real line breaks, so "."
        # effectively matched across lines.
        return re.findall(pattern, text, re.DOTALL)

    def return_html_url(self, html):
        """Return the repository hrefs found on one result page.

        Args:
            html: URL of the page to download (a URL, not markup).

        Returns:
            A list with the captured ``href`` attribute text of every
            ``<a class="v-align-middle" ...>`` tag, exactly as written in the
            page (surrounding double quotes included). Empty when the page
            has no such tag.
        """
        href_re = '''<a class="v-align-middle" .*? href=(.*?)>'''
        return self._find_all(href_re, html)

    def return_html_next(self, html):
        """Return the absolute URL of the next result page.

        Args:
            html: URL of the current page (a URL, not markup).

        Returns:
            The next page's URL as a string, or an empty list when the page
            has no ``rel="next"`` link (callers test the result for
            truthiness).
        """
        # Imported here because the file's import block does not provide
        # them; ``from html import ...`` is unaffected by the parameter that
        # happens to be named ``html``.
        from html import unescape
        from urllib.parse import urljoin

        href_re = ''' <a rel="next" href="(.*?)">'''
        matches = self._find_all(href_re, html)
        if not matches:
            return matches
        # The href is HTML-escaped ("&amp;") and already starts with "/":
        # unescape it and resolve it against the site root so the result has
        # neither entity text in the query string nor a doubled slash.
        return urljoin(self._SITE_ROOT, unescape(matches[0]))
def clone_the_git_jop(data):
    """Run ``git clone`` for every repository path in *data*, sequentially.

    Args:
        data: Iterable of site-relative repository paths as scraped from a
            result page, e.g. ``'"/user/repo"'``. Double quotes are removed
            before the path is appended to ``https://github.com``.

    Returns:
        None. A non-zero exit status from git is ignored, so one failed
        clone does not stop the remaining ones.
    """
    # Imported here because the file's import block does not provide it.
    import subprocess

    for href in data:
        repo_url = "https://github.com" + href.replace('"', "")
        try:
            # Argument list, no shell: the path comes from downloaded page
            # text and must never be interpreted as shell syntax.
            subprocess.run(["git", "clone", repo_url], check=False)
        except OSError as exc:
            # git could not be started at all (e.g. not on PATH); every
            # later attempt would fail the same way, so report and stop.
            print("git clone could not be started: {}".format(exc))
            return
if __name__ == "__main__":
    # Walk the result pages one at a time, handing each page's repository
    # links to a background thread that clones them.
    rein_git = Rein_git(index_url)
    thread_data = []  # every clone thread started so far
    file_html = []  # all repository hrefs collected across pages

    while True:
        page_links = rein_git.return_html_url(index_url)
        file_html += page_links

        worker = threading.Thread(target=clone_the_git_jop, args=(page_links,))
        worker.start()
        thread_data.append(worker)

        index_url = rein_git.return_html_next(index_url)
        if index_url:
            # Another page exists: wait a second before requesting it.
            time.sleep(1)
            continue
        break

    # No further page: block until every clone thread has finished.
    for worker in thread_data:
        worker.join()
|