Python crawler for downloading resources from a file-listing website, complete version (based on Python 3.6)
<-------------------------------- Download function ----------------------------->
import requests
import threading

# The URL of the file to download is passed in by the caller, e.g.
# url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/fix/cbofs/nos.cbofs.romsgrid.nc'

def Handler(start, end, url, filename, address):
    # Request only the byte range [start, end] of the file
    headers = {'Range': 'bytes=%d-%d' % (start, end)}
    r = requests.get(url, headers=headers, stream=True)
    # Write the chunk at its offset in the pre-allocated file
    with open(address + filename, "r+b") as fp:
        fp.seek(start)
        fp.write(r.content)


def download_file(address, url, num_thread=500):
    r = requests.head(url)
    try:
        file_name = url.split('/')[-1]
        # Content-Length gives the size of the response body; a server answering with
        # Connection: keep-alive and chunked transfer may not send it at all
        file_size = int(r.headers['content-length'])
    except (KeyError, ValueError):
        print("Check the URL, or the server does not support multi-threaded download")
        return
    # Pre-allocate a local file of the same size as the remote one
    fp = open(address + file_name, "wb")
    fp.truncate(file_size)
    fp.close()
    # Split the file into num_thread chunks and download them in parallel
    part = file_size // num_thread  # the last chunk also takes the remainder
    for i in range(num_thread):
        start = part * i
        if i == num_thread - 1:      # last chunk runs to the end of the file
            end = file_size - 1
        else:
            end = start + part - 1   # Range is inclusive, so stop one byte early
        t = threading.Thread(target=Handler,
                             kwargs={'start': start, 'end': end, 'url': url,
                                     'filename': file_name, 'address': address})
        t.setDaemon(True)
        t.start()
    # Wait for every download thread to finish
    main_thread = threading.current_thread()
    for t in threading.enumerate():
        if t is main_thread:
            continue
        t.join()
    print('%s finished downloading' % file_name)

# if __name__ == '__main__':
#     start = datetime.datetime.now().replace(microsecond=0)
#     download_file(address, url)
#     end = datetime.datetime.now().replace(microsecond=0)
#     print("Elapsed: ", end='')
#     print(end - start)
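download_file assumes the server honours HTTP Range requests. As a quick sanity check before splitting the work into chunks, the Accept-Ranges and Content-Length headers of a HEAD response can be inspected; this is only a sketch, using the sample NOAA file URL from the comment above:

import requests

url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/fix/cbofs/nos.cbofs.romsgrid.nc'
head = requests.head(url)
print(head.headers.get('Accept-Ranges'))    # 'bytes' means ranged requests are supported
print(head.headers.get('Content-Length'))   # total size used to split the download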
<------------------- Link-extraction function -------------------------->
def get_link(page):  # collect the href of every <a> link inside the table cells
    linkData = []
    for cell in page.find_all('td'):
        links = cell.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':   # optional filter on a leading slash
            data = each.get('href')
            linkData.append(data)
    return linkData
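For reference, get_link can be exercised on its own; a minimal sketch, assuming lxml is installed and using the NOAA listing URL from the main crawl further below:

import urllib.request
from bs4 import BeautifulSoup

url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
for href in get_link(soup):
    print(href)   # in this kind of listing, directory entries carry a '/' in the href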
<--------------------- Helper functions ----------------->
import urllib.request
from bs4 import BeautifulSoup
from findLinks import get_link
from Download import download_file
import os
import datetime
import time
import errno


def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (use "except OSError, exc:" on Python <2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

# def file_Down(connet, file):  # small-file download helper
#     urllib.request.urlretrieve(connet, file, Schedule)


def decice(data):  # an href containing a slash is a directory, anything else a file
    a = '/'
    if a in data:
        return 1
    else:
        return 0


def gain(url):
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')  # parse the listing page
    links = get_link(soup)              # collect every <a href=...> value
    return links


def take(links, file, file_cre, connet):
    if decice(links):
        mkdir_p(file)                    # directory entry: create the local folder
    else:
        start = datetime.datetime.now().replace(microsecond=0)
        download_file(file_cre, connet)  # file entry: download it into file_cre
        end = datetime.datetime.now().replace(microsecond=0)
        # Handler(start, end, connet, links[childLink], file_cre1)
        print("Elapsed: ", end='')
        print(end - start)
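As a quick illustration of the convention the crawler relies on (the hrefs below are examples in the style of the target directory listing, not taken from a live response):

print(decice('cbofs/'))                 # 1 -> treated as a sub-directory, take() creates a folder
print(decice('nos.cbofs.romsgrid.nc'))  # 0 -> treated as a file, take() downloads it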
<----------- Main crawl function ------------->
from urllib.parse import urljoin
from Carriage import decice
from Carriage import gain
from Carriage import take
import os
import time


def findAll():  # main crawl: walk the directory listing up to eight levels deep
    url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
    links = gain(url)
    print('Crawling: ' + url)
    for childLink in range(len(links) - 1):
        childLink = childLink + 1                    # skip the first entry (parent-directory link)
        connet = urljoin(url, links[childLink])      # absolute URL of the entry
        file = os.path.join('D:\\Info\\Index' + "/" + links[childLink])  # local path
        file_cre1 = os.path.join('D:\\Info\\Index' + "/")
        print(connet)
        take(links[childLink], file, file_cre1, connet)
        if decice(links[childLink]):
            link_next = gain(connet)                 # links one level down
        else:
            continue
        print("Start : %s" % time.ctime())
        time.sleep(5)
        print("End : %s" % time.ctime())
        for child_next in range(len(link_next) - 1):
            child_next = child_next + 1
            connet_next = urljoin(connet, link_next[child_next])      # absolute URL
            fileF = os.path.join(file, link_next[child_next])         # local path
            file_cre2 = file
            print(connet_next)
            take(link_next[child_next], fileF, file_cre2, connet_next)
            if decice(link_next[child_next]):
                link_nextF = gain(connet_next)       # links two levels down
            else:
                continue
            print("Start : %s" % time.ctime())
            time.sleep(5)
            print("End : %s" % time.ctime())
            for child_nextT in range(len(link_nextF) - 1):
                child_nextT = child_nextT + 1
                connet_nextT = urljoin(connet_next, link_nextF[child_nextT])
                fileT = os.path.join(fileF, link_nextF[child_nextT])
                file_cre3 = fileF
                print(connet_nextT)
                take(link_nextF[child_nextT], fileT, file_cre3, connet_nextT)
                if decice(link_nextF[child_nextT]):
                    link_nextT = gain(connet_nextT)  # links three levels down
                else:
                    continue
                for child_nextTh in range(len(link_nextT) - 1):
                    child_nextTh = child_nextTh + 1
                    connet_nextTh = urljoin(connet_nextT, link_nextT[child_nextTh])
                    fileTh = os.path.join(fileT, link_nextT[child_nextTh])
                    file_cre4 = fileT
                    print(connet_nextTh)
                    take(link_nextT[child_nextTh], fileTh, file_cre4, connet_nextTh)
                    if decice(link_nextT[child_nextTh]):
                        link_nextTh = gain(connet_nextTh)
                    else:
                        continue
                    for child_nextFo in range(len(link_nextTh) - 1):
                        child_nextFo = child_nextFo + 1
                        connet_nextFo = urljoin(connet_nextTh, link_nextTh[child_nextFo])
                        fileFo = os.path.join(fileTh, link_nextTh[child_nextFo])
                        file_cre5 = fileTh
                        print(connet_nextFo)
                        take(link_nextTh[child_nextFo], fileFo, file_cre5, connet_nextFo)
                        if decice(link_nextTh[child_nextFo]):
                            link_nextFo = gain(connet_nextFo)
                        else:
                            continue
                        for child_nextFi in range(len(link_nextFo) - 1):
                            child_nextFi = child_nextFi + 1
                            connet_nextFi = urljoin(connet_nextFo, link_nextFo[child_nextFi])
                            fileFi = os.path.join(fileFo, link_nextFo[child_nextFi])
                            file_cre6 = fileFo
                            print(connet_nextFi)
                            take(link_nextFo[child_nextFi], fileFi, file_cre6, connet_nextFi)
                            if decice(link_nextFo[child_nextFi]):
                                link_nextFi = gain(connet_nextFi)
                            else:
                                continue
                            for child_nextSi in range(len(link_nextFi) - 1):
                                child_nextSi = child_nextSi + 1
                                connet_nextSi = urljoin(connet_nextFi, link_nextFi[child_nextSi])
                                fileSi = os.path.join(fileFi, link_nextFi[child_nextSi])
                                file_cre7 = fileFi
                                print(connet_nextSi)
                                take(link_nextFi[child_nextSi], fileSi, file_cre7, connet_nextSi)
                                if decice(link_nextFi[child_nextSi]):
                                    link_nextSi = gain(connet_nextSi)
                                else:
                                    continue
                                for child_nextSe in range(len(link_nextSi) - 1):
                                    child_nextSe = child_nextSe + 1
                                    connet_nextSe = urljoin(connet_nextSi, link_nextSi[child_nextSe])
                                    fileSe = os.path.join(fileSi, link_nextSi[child_nextSe])
                                    file_cre8 = fileSi
                                    print(connet_nextSe)
                                    take(link_nextSi[child_nextSe], fileSe, file_cre8, connet_nextSe)
                                    if decice(link_nextSi[child_nextSe]):
                                        link_nextSe = gain(connet_nextSe)
                                    else:
                                        continue
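The eight copy-pasted loop levels above can also be collapsed into a single recursive walk. The sketch below is an alternative, not part of the original code; it lives in the same module as findAll (so the imports above are already available) and reuses gain, decice and take with the same "skip the first link" convention:

def crawl(url, local_dir, depth=0, max_depth=8):
    # Recursive equivalent of findAll(): visit one listing page and recurse into sub-directories.
    links = gain(url)
    for href in links[1:]:                       # links[0] is the parent-directory entry
        child_url = urljoin(url, href)
        child_path = os.path.join(local_dir, href)
        print(child_url)
        take(href, child_path, local_dir, child_url)
        if decice(href) and depth < max_depth:   # directory entry: go one level deeper
            crawl(child_url, child_path, depth + 1, max_depth)

# crawl('http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/', 'D:\\Info\\Index/')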
<-------------------- main entry point -------------------->
from way import findAll

if __name__ == '__main__':
    findAll()