#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Recursively crawl an HTTP directory listing and record every file URL.

``start()`` is the entry point: it walks the listing with ``dg()`` and writes
the collected file URLs, one per line, to a text file.
"""
import logging
import sys
import traceback

import requests
from bs4 import BeautifulSoup as bsp

# Recursion depth used for the top-level listing.
num = 0
num_2 = 0
# Every file URL found by dg(); start() empties it before each crawl.
data_url = []
tmp = []

# Seconds to wait for the server before giving up on one listing page.
REQUEST_TIMEOUT = 30


def _leading_version(href):
    """Return the first path component of *href* as a float, or None.

    Entries whose first component does not parse as a number (for example
    ``7.9.2009/`` or a plain file name) yield None.
    """
    try:
        return float(href.split('/')[0].split()[0])
    except (ValueError, IndexError):
        return None


def dg(url, url_head, url_tail, num, centos_ver):
    """Crawl one listing page and recurse into its sub-directories.

    The page at ``url + url_head + "/" + url_tail`` is fetched and every
    ``<a href>`` on it is inspected:

    * links ending in ``../`` (parent directory) are ignored;
    * at depth 0 only, entries whose leading path component is a number
      smaller than *centos_ver* are skipped and every remaining entry is
      logged at INFO level;
    * links ending in ``/`` are crawled recursively with ``num + 1``;
    * every other link is appended to the module-level ``data_url`` list.

    Args:
        url: scheme and host, e.g. ``https://mirrors.aliyun.com``.
        url_head: absolute path of the parent listing.
        url_tail: entry below *url_head* to fetch (may be empty).
        num: current recursion depth; 0 for the top-level listing.
        centos_ver: minimum numeric version to keep at depth 0.

    Raises:
        SystemExit: with status 1 if parsing the page or crawling one of its
            sub-directories fails (the traceback is printed first).
        requests.RequestException: if fetching this page itself fails.
    """
    response = requests.get("%s%s/%s" % (url, url_head, url_tail),
                            timeout=REQUEST_TIMEOUT)
    soup = bsp(response.content, 'html.parser')
    try:
        # soup.title.string proved unreliable for the original author, so the
        # tag is stripped by hand; the last whitespace-separated token of the
        # title is used as the absolute path of this listing
        # (presumably a title such as "Index of /centos/" -- TODO confirm).
        dg_url_head = (str(soup.title)
                       .replace('<title>', '')
                       .replace('</title>', '')
                       .split()[-1])
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            # Anchors without an href used to raise inside the error handler
            # and abort the whole crawl; skip them together with the
            # parent-directory link.
            if not href or href.endswith('../'):
                continue
            if num == 0:
                version = _leading_version(href)
                if version is not None and version < centos_ver:
                    continue
                logging.info("%s%s/%s" % (url, url_head, href.split('/')[0]))
            if href.endswith('/'):
                # Directory: descend one level.
                dg(url, dg_url_head, href, num + 1, centos_ver)
            else:
                # Plain file: remember its full download URL.
                data_url.append("%s%s%s" % (url, dg_url_head, href))
    except Exception:
        # SystemExit raised by a nested call is not an Exception, so it
        # passes straight through without printing a second traceback.
        traceback.print_exc()
        # A failed crawl must not report success to the shell.
        sys.exit(1)


def start(file, url, url_head, url_tail, centos_ver):
    """Crawl the listing and write all discovered file URLs to *file*.

    Args:
        file: path of the text file to (over)write, one URL per line.
        url, url_head, url_tail, centos_ver: forwarded to ``dg()``.

    Returns:
        The string ``'ok'`` once the list has been written.
    """
    # data_url is module-level state; without this reset a second call would
    # write every URL of the previous crawl again.
    del data_url[:]
    dg(url, url_head, url_tail, num, centos_ver)
    with open(file, 'w') as output:
        output.write('\n'.join(data_url))
    return 'ok'
上面保存为dg.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download and verify a local mirror from the URL list produced by dg.py.

Run as a script: it crawls the mirror (via ``dg.start``), downloads every
listed file that is missing locally with ``wget`` in a process pool, and then
reconciles the local tree against the list.
"""
import errno
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import traceback
import urllib
from multiprocessing import Process, Pool

import requests

import dg

date_ymd = time.strftime("%Y-%m-%d", time.localtime())


def date_time():
    """Return the current local time as ``YYYY-MM-DDTHH-MM-SS``."""
    return time.strftime("%Y-%m-%dT%H-%M-%S", time.localtime())


def _ensure_dir(path):
    """Create *path* (and parents) unless it already is a directory.

    Safe when several worker processes create the same directory at once.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        if err.errno != errno.EEXIST or not os.path.isdir(path):
            raise


# Where downloaded files are stored.
file_path = '/data/wwwroot/yum/centos'
# Where bookkeeping files (URL list, lock, logs) are stored.
file_dir = '.'
file_dir_log = "./log"
_ensure_dir(file_dir)
_ensure_dir(file_dir_log)
_ensure_dir(file_path)
download_log_name = "%s/download_log_%s.log" % (file_dir_log, date_ymd)
# Lock/progress file: present while a run is in progress.
download_Record_name = "%s/download_Record.lock" % file_dir
# URL list written by dg.start().
network_list = "%s/all_list.txt" % file_dir
# Number of download worker processes.
process_num = 6
# Mirror host crawled by dg.py.
dg_url = 'https://mirrors.aliyun.com'
# Directory on the mirror.
dg_url_head = '/centos'
# Entry below dg_url_head to start from.
dg_url_tail = ''
# Lowest version to crawl.
dg_centos_ver = 7

# Seconds to wait for the mirror when asking for a file's headers.
REQUEST_TIMEOUT = 30
# Upper bound on reconcile passes in delete(), so a file that can never be
# fetched does not keep the script running forever.
MAX_VERIFY_ROUNDS = 5
# Pause between whole-run retries after an unexpected error.
RETRY_DELAY_SECONDS = 5

# DEBUG and above go to "<log>_debug".
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)-8s %(message)s',
                    datefmt='[%Y-%m-%d %H:%M:%S]',
                    filename="%s_debug" % (download_log_name),
                    filemode='a')

# INFO and above are also echoed to stderr ...
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('[%(asctime)s] %(filename)s[line:%(lineno)d] %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
# ... and written to the regular log file.
file_handler = logging.FileHandler(download_log_name)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logging.getLogger('').addHandler(file_handler)


def date_def():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def Schedule(a, b, c):
    """Progress callback in the ``urlretrieve`` reporthook style.

    Args:
        a: number of blocks transferred so far.
        b: block size in bytes.
        c: total size of the remote file.

    Logs the percentage at DEBUG level. Nothing is logged when *c* is not
    positive, because no percentage can be computed then.
    """
    if c <= 0:
        return
    per = min(100.0 * a * b / c, 100)
    logging.debug('%.2f%%' % (per))


def file_add_del(filename, data):
    """Overwrite *filename* with *data* (text is stored as UTF-8)."""
    # The file is opened in binary mode; encode text so this also works on
    # Python 3, where writing a str to a binary file raises TypeError.
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    with open(filename, 'wb') as output:
        output.write(data)


def file_log(filename, data):
    """Append *data* to *filename*."""
    with open(filename, 'a') as output:
        output.write(data)


def _mirror_root():
    """Return the URL prefix that corresponds to ``file_path`` locally."""
    return "%s%s" % (dg_url, dg_url_head)


def _read_network_list():
    """Return the non-empty lines of the URL list written by dg.start()."""
    with open(network_list) as handle:
        return [line for line in handle.read().split('\n') if line != '']


def _download_record(num, status, url, down_size, size, error_num):
    """Build the JSON log record for one finished download attempt."""
    list_size = None
    if size is not None:
        list_size = size.replace('\n', '').replace('\r', '')
    return json.dumps({num: {"status": status, "url": url,
                             "down_size": down_size, "list_szie": list_size,
                             "num": error_num, "time": date_def()}})


def url_down(url_n, num):
    """Download one URL below the mirror root into ``file_path``.

    The file is fetched with ``wget`` into a timestamped temporary name, moved
    into place, and its size compared with the server's ``Content-Length``.
    A failed attempt is retried up to three more times.

    Args:
        url_n: URL to download; surrounding CR/LF characters are removed.
        num: job number (int or numeric string), used as the log record key.

    Returns:
        The JSON log record as a string, or None when *url_n* is empty.
    """
    num = int(num)
    url = url_n.replace('\n', '').replace('\r', '')
    if url == '':
        return None

    # Only the headers are needed here; stream=True keeps requests from
    # downloading the whole body that wget fetches again below.
    response = requests.get(url, stream=True, timeout=REQUEST_TIMEOUT)
    try:
        size = response.headers.get('Content-Length')
    finally:
        response.close()
    expected_size = int(size) if size is not None else None

    # Drop "scheme:", "", host and the first path component; what is left is
    # the directory relative to file_path, followed by the file name.
    parts = url.split('/')
    file_name = parts[-1]
    rel_dir = '/'.join(parts[4:-1])
    target_dir = '%s/%s' % (file_path, rel_dir)
    target = '%s/%s/%s' % (file_path, rel_dir, file_name)
    logging.debug(url)

    error_num = 0
    while True:
        # An existing file is always downloaded again.
        if os.path.exists(target):
            os.remove(target)
        _ensure_dir(target_dir)
        tmp_path = '%s/%s/%s_%s' % (file_path, rel_dir, file_name, date_time())
        # Argument list instead of a shell string: URLs come from a crawled
        # page. "-O" makes wget write to tmp_path, and call() waits for wget
        # to finish before the file is moved.
        status = subprocess.call(
            ['wget', '--limit-rate=200k', '-O', tmp_path, url])
        if os.path.exists(tmp_path):
            shutil.move(tmp_path, target)

        down_size = os.path.getsize(target) if os.path.exists(target) else None
        complete = (
            status == 0
            and down_size is not None
            and (expected_size is None or down_size == expected_size)
        )
        if complete:
            error_log = _download_record(num, "ok", url, down_size, size,
                                         error_num)
            logging.info(error_log)
            return error_log
        if error_num > 2:
            error_log = _download_record(num, "error", url, down_size, size,
                                         error_num)
            logging.error(error_log)
            return error_log
        error_num += 1


def dg_Local_files_and_network_files(path):
    """Return the mirror URL of every file found below *path*."""
    root_url = _mirror_root()
    file_list = []
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            relative = os.path.relpath(os.path.join(root, name), path)
            file_list.append(
                "%s/%s" % (root_url, relative.replace(os.sep, "/")))
    return file_list


def delete():
    """Reconcile the local tree with the URL list.

    Each pass downloads files that are listed but missing; when nothing is
    missing it deletes local files that are not listed. Passes repeat until
    both sets agree or ``MAX_VERIFY_ROUNDS`` is reached.
    """
    root_url = _mirror_root()
    for _ in range(MAX_VERIFY_ROUNDS):
        local_urls = set(dg_Local_files_and_network_files(file_path))
        listed_urls = set(_read_network_list())
        delete_data = sorted(local_urls - listed_urls)
        add = sorted(listed_urls - local_urls)
        if not os.listdir(file_path):
            logging.info("删除空目录%s" % file_path)
        if not add and not delete_data:
            logging.info("校验成功本地与网络无差别")
            return
        if add:
            for url in add:
                logging.info("下载差异文件%s" % url)
                url_down(url, "0")
        else:
            for url in delete_data:
                local_file = "%s%s" % (file_path, url[len(root_url):])
                logging.info("删除差异文件%s" % local_file)
                os.remove(local_file)
    logging.error("校验%d轮后本地与网络仍有差别" % MAX_VERIFY_ROUNDS)


def _download_missing():
    """Download every listed file that is not present locally, in parallel."""
    started = time.time()
    pending = sorted(set(_read_network_list())
                     - set(dg_Local_files_and_network_files(file_path)))
    pool = Pool(process_num)
    results = []
    try:
        for index, url in enumerate(pending, 1):
            results.append(
                pool.apply_async(url_down, args=(url, str(index),)))
        logging.info('等待所有子进程完成…')
    finally:
        pool.close()
        pool.join()
    # apply_async hides worker exceptions until get() is called; surface
    # them in the log. delete() retries the affected files afterwards.
    for url, result in zip(pending, results):
        try:
            result.get()
        except Exception:
            logging.error('%s\n%s' % (url, traceback.format_exc()))
    logging.info('所有进程运行 %0.2f 秒.' % (time.time() - started))


def _download_and_verify():
    """Run one download pass followed by verification."""
    logging.info("开始下载")
    _download_missing()
    logging.info("下载完成")
    logging.info("开始校验")
    delete()
    # Separator line marking the end of this run in the log file.
    file_log(download_log_name, "#" * 100)


def _remove_lock():
    """Delete the lock file if it exists."""
    if os.path.exists(download_Record_name):
        os.remove(download_Record_name)


def _run_once():
    """Perform one complete run.

    Returns:
        True when a download/verify pass ran, False when the local tree
        already contained every listed file.
    """
    if os.path.exists(download_Record_name) and os.path.exists(network_list):
        # A previous run was interrupted: resume with the existing list.
        logging.info("检测到上次未下载完,重新上次的下载")
        _download_and_verify()
        _remove_lock()
        return True

    # A lock without a list cannot be resumed; start over.
    _remove_lock()
    logging.info("dg.py运行")
    dg_po = dg.start(network_list, dg_url, dg_url_head, dg_url_tail,
                     dg_centos_ver)
    if not dg_po or 'ok' not in dg_po:
        logging.error("dg运行故障")
        # An unreliable list must not drive downloads or deletions.
        raise RuntimeError("dg.start did not report success")
    file_add_del(download_Record_name, '')

    missing = (set(_read_network_list())
               - set(dg_Local_files_and_network_files(file_path)))
    if not missing:
        logging.info("不用更新")
        _remove_lock()
        return False

    file_add_del(download_Record_name, "0")
    _download_and_verify()
    _remove_lock()
    return True


if __name__ == '__main__':
    while True:
        try:
            if _run_once():
                logging.info("结束")
            break
        except Exception:
            # "except Exception" lets SystemExit and KeyboardInterrupt end
            # the script instead of being retried forever.
            details = traceback.format_exc()
            if (not os.path.exists(download_Record_name)
                    and os.path.exists(network_list)):
                # No lock means the crawl itself failed, so the freshly
                # written list cannot be trusted.
                logging.info("由于dg.py执行故障要将刚生成的以下文件去除")
                os.remove(network_list)
                logging.info(network_list)
            logging.error('\n%s' % details)
            time.sleep(RETRY_DELAY_SECONDS)
这个保存为dg_download.py
执行dg_download.py就可以开始爬取了,可以修改其中的dg_centos_ver来指定起始爬取版本
dg.py负责爬取镜像站的文件列表,dg_download.py负责判断是否需要更新并下载文件
剩下的就是等爬取好后,搭建web服务发布出去