python url_parse.py

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python url_parse.py相关的知识,希望对你有一定的参考价值。

from collections import namedtuple
import urllib.parse
import logging
import re
import html
from urllib.parse import urlunsplit, urljoin, urlparse, urlsplit
import tldextract
# NOTE(review): removed `urljoin = False`, which clobbered the
# urllib.parse.urljoin function imported above, and a duplicate
# LOGGER assignment.
LOGGER = logging.getLogger(__name__)


def is_absolute_url(url):
    """Return True when *url* is scheme-relative ('//...') or carries an
    explicit http/https scheme (case-insensitive); False otherwise."""
    lowered = url.lower()
    return (url.startswith('//')
            or lowered.startswith('http://')
            or lowered.startswith('https://'))


def clean_webpage_links(link, urljoin=None):
    """Normalize a raw link string scraped from an HTML page.

    Strips surrounding ASCII control characters and spaces, repairs
    malformed scheme separators (runs of '/' or '\\' after the scheme),
    unescapes HTML entities, and discards implausibly long links.

    Args:
        link: the raw href/src attribute value.
        urljoin: the base URL the caller will join against; used here
            only in the log message.  NOTE(review): this parameter
            shadows urllib.parse.urljoin inside the function body.

    Returns:
        The cleaned link, or '' when the link should be dropped.
    """
    # Trim every C0 control character (0x00-0x1f) plus the space character.
    link = link.strip('\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
                      '\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ')
    # Collapse 3+ slashes after an (optional) http(s) scheme to exactly two,
    # e.g. 'http:////host' -> 'http://host'.
    m = re.match(r'(?:https?:)?/{3,}', link, re.I)
    if m:
        start = m.group(0)
        link = start.rstrip('/') + '//' + link.replace(start, '', 1)
    # Same repair for backslashes, e.g. 'http:\\\\host' -> 'http://host'.
    m = re.match(r'(?:https?:)?\\{2,}', link, re.I)
    if m:
        start = m.group(0)
        link = start.rstrip('\\') + '//' + link.replace(start, '', 1)
    if is_absolute_url(link):
        start = link.find('://') + 3  # works whether we have a scheme or not
        # If the first separator after the netloc is a backslash, turn that
        # one backslash into the '/' that starts the path.
        m = re.search(r'[\\/?#]', link[start:])
        if m:
            if m.group(0) == '\\':
                link = link[0:start] + link[start:].replace('\\', '/', 1)
    # Unescape HTML entities only when '&' hints that some may be present.
    if '&' in link:
        link = html.unescape(link)
    if len(link) > 300:  # arbitrary choice
        # Over-long "links" usually absorbed surrounding markup; cut at the
        # first character that is rare in URLs but common in HTML.
        m = re.match(r'(.*?)[<>\"\'\r\n ]', link)  # rare  in urls and common in html markup
        if m:
            link = m.group(1)
        if len(link) > 2000:
            if link.startswith('javascript:') or link.startswith('data:'):
                return ''
            logstr = link[:50] + '...'
            LOGGER.info('webpage urljoin=%s has an invalid-looking link %s of length %d',
                        str(urljoin), logstr, len(link))
            return ''  # caller treats '' as "no usable link"
    # Remove embedded tab/CR/LF characters anywhere in the link.
    link = link.replace('\t', '')
    link = link.replace('\r', '')
    link = link.replace('\n', '')
    return link

# Characters that may appear in a two-digit percent-escape (either case).
_HEXDIGITS = set('0123456789abcdefABCDEF')


def unquote(text, safe):
    """Selectively percent-decode *text*.

    Only escapes whose uppercased two-digit hex code appears in *safe*
    (a collection of strings such as {'2F', '41'}) are decoded; every
    other '%XX' sequence, malformed escape, or literal '%%' is kept
    verbatim.

    Args:
        text: the string to process.
        safe: collection of uppercase two-digit hex codes to decode.

    Returns:
        *text* with exactly the *safe* escapes decoded.
    """
    pieces = text.split('%')
    text = pieces.pop(0)
    for p in pieces:
        if text.endswith('%'):  # the previous piece was '%%'; keep it literal
            text += '%' + p
            continue
        quote = p[:2]
        rest = p[2:]
        # Fix: the original referenced an undefined name `valid_hex`,
        # raising NameError for any well-formed escape. Validate the two
        # hex digits explicitly instead.
        if len(quote) == 2 and quote[0] in _HEXDIGITS and quote[1] in _HEXDIGITS:
            quote = quote.upper()
        if quote in safe:
            text += chr(int(quote, base=16)) + rest
        else:
            text += '%' + quote + rest
    return text


def quote(text, quoteme):
    """Return *text* with each character present in the mapping *quoteme*
    replaced by its mapped string; all other characters pass through."""
    return ''.join(quoteme[ch] if ch in quoteme else ch for ch in text)

def safe_url_canonicalization(url):
    """Conservatively canonicalize *url*.

    Percent-decodes the escapes listed in the module-level `unreserved`
    set, lowercases the scheme, normalizes an empty path to '/', and
    converts backslashes in the path to forward slashes.

    Returns:
        A (canonical_url, fragment) pair. URLs whose scheme is not
        http/https/ftp are returned unchanged with an empty fragment.

    Raises:
        ValueError: when urlsplit cannot parse the URL (logged first).
    """
    original_url = url
    url = unquote(url, unreserved)
    try:
        scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    except ValueError:
        LOGGER.info('invalid url %s', url)
        raise
    scheme = scheme.lower()
    if scheme not in {'http', 'https', 'ftp'}:
        return original_url, ''
    # Might not be 100% safe, but needed for Windows-style separators.
    path = (path or '/').replace('\\', '/')
    return urllib.parse.urlunsplit((scheme, netloc, path, query, None)), fragment



def scheme_allowed(url):
    """Return True only for http/https URLs; every other scheme
    (including scheme-less strings) is rejected."""
    return urlsplit(url).scheme in ('http', 'https')


# File extensions for video formats (kept for reference; not used by
# extension_allowed below).
video_extension = set(('3gp', 'af', 'asf', 'avchd', 'avi', 'cam', 'dsh', 'flv', 'm1v', 'm2v',
                       'fla', 'flr', 'm4v', 'mkv', 'sol', 'wrap', 'mng', 'mov', 'mpg', 'mpeg',
                       'mp4', 'mpe', 'mxf', 'nsv', 'ogg', 'rm', 'svi', 'smi', 'wmv', 'webm'))

# Extensions of non-text resources a text crawler should skip.
not_text_extension = set(('jpg', 'jpeg', 'png', 'gif', 'webp', 'svg',
                          'mp3', 'mid', 'midi',
                          'pdf', 'ps',
                          'gz', 'bz2', 'tar', 'tgz', 'zip', 'rar',
                          'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
                          'odt', 'fodt', 'odp', 'fodp', 'ods', 'fods', 'odg', 'fodg', 'odf',
                          'swf'))

# Extensions that normally serve text/html content.
text_extension = set(('txt', 'html', 'php', 'htm', 'aspx', 'asp', 'shtml', 'jsp'))
# Text resources embedded by pages rather than linked as documents.
text_embed_extension = set(('js', 'css'))


def extension_allowed(url):
    """Return False when the URL's path ends in a known non-text file
    extension (image, archive, office document, ...); True otherwise.

    URLs with no path, directory URLs ending in '/', and extensionless
    or unrecognized extensions are all allowed.
    """
    # Fixes vs original: parse the URL once instead of up to four times,
    # and avoid the ValueError the original raised on a path with no '/'
    # (e.g. 'pic.jpg') via `_, last = path.rsplit('/', maxsplit=1)`.
    path = urlsplit(url).path
    if not path or path.endswith('/'):
        return True
    last_part = path.rsplit('/', maxsplit=1)[-1]
    if last_part and '.' in last_part:
        extension = last_part.rsplit('.', maxsplit=1)[-1]
        if extension.lower() in not_text_extension:
            return False
    return True


# Run a small smoke test only when executed as a script; importing this
# module no longer triggers network-adjacent work or printing.
if __name__ == '__main__':
    url = clean_webpage_links('https://www.reddit.com', urljoin=True)
    if extension_allowed(url) and is_absolute_url(url):
        print(url)

以上是关于python url_parse.py的主要内容,如果未能解决你的问题,请参考以下文章

001--python全栈--基础知识--python安装

Python代写,Python作业代写,代写Python,代做Python

Python开发

Python,python,python

Python 介绍

Python学习之认识python