python url_parse.py
This post walks through url_parse.py, a small set of helpers for cleaning, canonicalizing, and filtering URLs extracted from web pages; hopefully it is a useful reference.
import urllib.parse
from urllib.parse import urlsplit
import logging
import re
import html

LOGGER = logging.getLogger(__name__)
def is_absolute_url(url):
    # scheme-relative (//host/path) and explicit http(s) links count as absolute
    if url[0:2] == '//':
        return True
    if url[0:7].lower() == 'http://' or url[0:8].lower() == 'https://':
        return True
    return False
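A quick sanity check of the predicate (the URLs here are made up for illustration):

assert is_absolute_url('//cdn.example.com/app.js')      # scheme-relative
assert is_absolute_url('HTTPS://example.com/')          # case-insensitive scheme
assert not is_absolute_url('/about.html')               # site-relative path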
def clean_webpage_links(link, urljoin=None):
    # strip leading/trailing control characters and spaces
    link = link.strip('\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
                      '\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ')

    # collapse a run of 3+ slashes after the scheme down to the normal two
    m = re.match(r'(?:https?:)?/{3,}', link, re.I)
    if m:
        start = m.group(0)
        link = start.rstrip('/') + '//' + link.replace(start, '', 1)

    # do the same for backslashes, which sloppy markup sometimes uses
    m = re.match(r'(?:https?:)?\\{2,}', link, re.I)
    if m:
        start = m.group(0)
        link = start.rstrip('\\') + '//' + link.replace(start, '', 1)

    # if the first path separator is a backslash, flip it to a forward slash
    if is_absolute_url(link):
        start = link.find('://') + 3  # works whether we have a scheme or not
        m = re.search(r'[\\/?#]', link[start:])
        if m and m.group(0) == '\\':
            link = link[0:start] + link[start:].replace('\\', '/', 1)

    if '&' in link:
        link = html.unescape(link)

    if len(link) > 300:  # arbitrary choice
        m = re.match(r'(.*?)[<>\"\'\r\n ]', link)  # rare in urls and common in html markup
        if m:
            link = m.group(1)
        if len(link) > 2000:  # arbitrary choice
            if link.startswith('javascript:') or link.startswith('data:'):
                return ''
            logstr = link[:50] + '...'
            LOGGER.info('webpage urljoin=%s has an invalid-looking link %s of length %d',
                        str(urljoin), logstr, len(link))
            return ''  # the empty string will later urljoin back to the base url

    link = link.replace('\t', '')
    link = link.replace('\r', '')
    link = link.replace('\n', '')
    return link
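The functions below reference two module-level sets, valid_hex and unreserved, that the post never defines. Here is a plausible reconstruction, inferred from how unquote consumes them (both hold two-character hex codes, with unreserved covering the RFC 3986 unreserved characters); it is an assumption, not the author's exact code:

# Assumed definitions -- the original listing uses these names without
# defining them, so this is a reconstruction.
hex_digits = '0123456789abcdefABCDEF'
valid_hex = set(a + b for a in hex_digits for b in hex_digits)  # hex pairs, any case mix

# uppercase hex codes of the RFC 3986 unreserved characters: the only
# percent-escapes that can always be decoded without changing the url
unreserved = set('%02X' % ord(c) for c in
                 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~')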
def unquote(text, safe):
    # percent-decode only the escapes listed in safe (a set of uppercase
    # two-character hex codes); other valid escapes are uppercased in place
    pieces = text.split('%')
    text = pieces.pop(0)
    for p in pieces:
        if text.endswith('%'):  # deal with %%
            text += '%' + p
            continue
        quote = p[:2]
        rest = p[2:]
        if quote in valid_hex:
            quote = quote.upper()
            if quote in safe:
                text += chr(int(quote, base=16)) + rest
            else:
                text += '%' + quote + rest
        else:
            # not a valid percent-escape; pass it through unchanged
            text += '%' + p
    return text
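For example, with the unreserved set sketched above, %41 ('A') decodes while %2f (a slash, which is reserved) only has its hex case normalized:

print(unquote('%41bc%2fdef', unreserved))  # -> 'Abc%2Fdef'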
def quote(text, quoteme):
    # quoteme maps individual characters to their replacement strings
    ret = ''
    for c in text:
        if c in quoteme:
            c = quoteme[c]
        ret += c
    return ret
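A small illustration of the mapping-based interface (the dict here is a made-up example, not a table from the original module):

print(quote('a b', {' ': '%20'}))  # -> 'a%20b'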
def safe_url_canonicalization(url):
    # canonicalize using only transformations that never change a url's meaning
    original_url = url
    url = unquote(url, unreserved)  # decode only always-safe escapes
    try:
        (scheme, netloc, path, query, fragment) = urllib.parse.urlsplit(url)
    except ValueError:
        LOGGER.info('invalid url %s', url)
        raise
    scheme = scheme.lower()
    if scheme not in ('http', 'https', 'ftp'):
        return original_url, ''
    if path == '':
        path = '/'
    path = path.replace('\\', '/')  # not guaranteed 100% safe, but needed for Windows-style links
    return urllib.parse.urlunsplit((scheme, netloc, path, query, None)), fragment
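Putting those pieces together: the scheme is lowercased, only unreserved escapes are decoded, and the fragment is split off (hypothetical input for illustration):

print(safe_url_canonicalization('HTTP://Example.com/%7Euser?q=1#top'))
# -> ('http://Example.com/~user?q=1', 'top')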
def scheme_allowed(url):
    allowed_schemes = set(('http', 'https'))
    return urlsplit(url).scheme in allowed_schemes
video_extension = set(('3gp', 'af', 'asf', 'avchd', 'avi', 'cam', 'dsh', 'flv', 'm1v', 'm2v',
                       'fla', 'flr', 'm4v', 'mkv', 'sol', 'wrap', 'mng', 'mov', 'mpg', 'mpeg',
                       'mp4', 'mpe', 'mxf', 'nsv', 'ogg', 'rm', 'svi', 'smi', 'wmv', 'webm'))

not_text_extension = set(('jpg', 'jpeg', 'png', 'gif', 'webp', 'svg',
                          'mp3', 'mid', 'midi',
                          'pdf', 'ps',
                          'gz', 'bz2', 'tar', 'tgz', 'zip', 'rar',
                          'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
                          'odt', 'fodt', 'odp', 'fodp', 'ods', 'fods', 'odg', 'fodg', 'odf',
                          'swf'))

text_extension = set(('txt', 'html', 'php', 'htm', 'aspx', 'asp', 'shtml', 'jsp'))
text_embed_extension = set(('js', 'css'))
def extension_allowed(url):
    # allow directory-like urls and anything without a known non-text extension
    path = urlsplit(url).path
    if path:
        if path.endswith('/'):
            return True
        # rpartition avoids a ValueError when the path contains no slash
        last_part = path.rpartition('/')[2]
        if last_part and '.' in last_part:
            _, extension = last_part.rsplit('.', maxsplit=1)
            if extension.lower() in not_text_extension:
                return False
    return True
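The filter only rejects links whose final path component carries a known non-text extension; everything ambiguous is allowed through (made-up URLs again):

assert not extension_allowed('https://example.com/report.pdf')  # known binary type
assert extension_allowed('https://example.com/blog/')           # directory-style
assert extension_allowed('https://example.com/page.html')       # text type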
if __name__ == '__main__':
    # quick demo: clean a link, then check that it looks fetchable
    url = 'https://www.reddit.com'
    url = clean_webpage_links(url, urljoin=True)
    if extension_allowed(url) and is_absolute_url(url):
        print(url)