python: A Python script to fetch links from a URL
# The spider: seeds the crawl with a start URL, spawns Worker threads to fetch
# pages and a sqlWorker thread to persist the results.
import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
import time
from queue import Queue
import threading
from worker import Worker
from sqlworker import sqlWorker
import pymysql


class spider():

    def __init__(self):
        # set vars
        self.visitedLinks = set()
        self.allExtLinks = Queue()
        self.maxThreads = 10
        self.workers = []
        self.running = True
        # create db connection
        self.initDb()
        # initial link
        self.startLink()
        # run the spider
        self.run()

    def initDb(self):
        conn = pymysql.connect(host='', unix_socket='/tmp/mysql.sock', user='', passwd='', db='')
        self.cur = conn.cursor(pymysql.cursors.DictCursor)
        self.cur.execute('USE `0090-scraping`')

    def randomSeed(self):
        # pick a random url already stored in the db (unused alternative to seedUrl)
        self.cur.execute('select url from urls order by rand() limit 1')
        return self.cur.fetchone()['url']

    def startLink(self):
        self.allExtLinks.put({
            'url': self.seedUrl()
        })

    def seedUrl(self):
        # only the first return is reached; the others are alternative seeds left in place
        return 'http://www.reddit.com/'
        return 'http://www.bbc.co.uk'
        return 'http://shopping.indiatimes.com/lifestyle/bed-linen/8-designer-rajasthani-cotton-double-bed-sheets-with-16-pillow-covers/11574/p_B4661019'

    def createWorker(self, allExtLinks, threadNum, cur, visitedLinks):
        return Worker(allExtLinks, threadNum, cur, visitedLinks)

    def getUniques(self):
        # unfinished stub in the original; kept as a no-op
        pass

    def run(self):
        # we start with 1 active link
        activeThreads = 1
        # the queue feeding the sql worker
        self.pending = Queue()
        # waiting for output
        print("Spider: Waiting...")
        # create the sql worker thread
        self.sW = sqlWorker(self.pending, self.cur)
        self.sW.start()
        # while we are running
        while self.running:
            # show that the loop is running
            print(' ')
            print(' -------- Ext Links ' + str(self.allExtLinks.qsize()) + ', Threads: ' + str(threading.activeCount()) + ' ----------')
            print(' ')
            # if thread count < max - start a new thread
            if threading.activeCount() < self.maxThreads:
                w = self.createWorker(self.allExtLinks, activeThreads, self.cur, self.visitedLinks)
                activeThreads = activeThreads + 1
                self.workers.append(w)
                w.start()
            # end the dead workers (iterate over a copy because finished workers are removed)
            for w in list(self.workers):
                # if the worker is still running
                if w.is_alive():
                    # get the worker's visited links
                    for i in w.getVisitedLinks():
                        self.visitedLinks.add(i)
                    # push the combined visited links back to the worker thread
                    w.setVisitedLinks(self.visitedLinks)
                    # append the waiting data
                    for i in w.getUrlDetails():
                        self.pending.put(i)
                # join the dead threads and count
                if not w.is_alive():
                    w.join()
                    activeThreads = activeThreads - 1
                    self.workers.remove(w)
            # sleep 1 second per loop
            time.sleep(1)
            # end the loop if no more links are queued
            if self.allExtLinks.empty():
                self.running = False
        # join active threads - to end the app
        while threading.activeCount() > 1:
            for w in self.workers:
                w.join()
            self.sW.join()
        # waiting for output
        print("Spider: Complete...")


if __name__ == '__main__':
    s = spider()
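Stripped of the threading and MySQL plumbing, the core job each Worker does per page is: fetch the URL, confirm the response is HTML, and collect the anchor hrefs. The following is a minimal, single-threaded sketch of that idea, not part of the original script; the function name fetch_links and the use of urljoin are my own illustration.

from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def fetch_links(url):
    # open the page and skip anything that is not text/html
    response = urlopen(url)
    if 'text/html' not in (response.info().get('Content-Type') or '').lower():
        return []
    soup = BeautifulSoup(response.read(), 'lxml')
    # resolve every href against the page URL so relative links become absolute
    return [urljoin(url, a['href']) for a in soup.findAll('a', href=True)]

if __name__ == '__main__':
    for link in fetch_links('http://www.bbc.co.uk'):
        print(link)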
# sqlworker.py — the sqlWorker thread persists crawled page details to MySQL.
import sys, traceback
from threading import Thread
import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import logging
import re
from queue import Empty
import pymysql
import time

'''
logging config
'''
logging.basicConfig(level=logging.DEBUG)


class sqlWorker(Thread):

    def __init__(self, queue, cur):
        '''
        init with the queue and the shared db cursor - this worker is a Thread subclass
        '''
        Thread.__init__(self)
        self.cursor = cur
        self.queue = queue
        self.running = True

    def join(self, timeout=None):
        '''
        when the thread joins send the loop end signal
        '''
        self.running = False
        super(sqlWorker, self).join(timeout)

    def saveLink(self, url, title, description):
        '''
        see if the url exists
        '''
        self.cursor.execute('select * from urls where url = %s', (url,))
        r = self.cursor.fetchone()
        url_id = 0
        if r is None:
            self.cursor.execute('insert into urls (url) values (%s)', (url,))
            url_id = self.cursor.lastrowid  # id of the row we just inserted
        else:
            url_id = r['urls_id']
        '''
        see if the title exists
        '''
        self.cursor.execute('select * from phrazes where phraze = %s', (title,))
        r = self.cursor.fetchone()
        title_id = 0
        if r is None:
            self.cursor.execute('insert into phrazes (phraze) values (%s)', (title,))
            title_id = self.cursor.lastrowid
        else:
            title_id = r['phrazes_id']
        '''
        add title to pivot
        '''
        self.cursor.execute('select * from url_phraze_pivot where urls_id = %s and phrazes_id = %s', (url_id, title_id))
        r = self.cursor.fetchone()
        if r is None:
            self.cursor.execute('insert into url_phraze_pivot (urls_id, phrazes_id, occurrences) values (%s, %s, 0)', (url_id, title_id))
        else:
            self.cursor.execute('update url_phraze_pivot set occurrences = occurrences + 1 where urls_id = %s and phrazes_id = %s', (url_id, title_id))
        '''
        see if the description exists
        '''
        self.cursor.execute('select * from phrazes where phraze = %s', (description,))
        r = self.cursor.fetchone()
        description_id = 0
        if r is None:
            self.cursor.execute('insert into phrazes (phraze) values (%s)', (description,))
            description_id = self.cursor.lastrowid
        else:
            description_id = r['phrazes_id']
        '''
        add description to pivot
        '''
        self.cursor.execute('select * from url_phraze_pivot where urls_id = %s and phrazes_id = %s', (url_id, description_id))
        r = self.cursor.fetchone()
        if r is None:
            self.cursor.execute('insert into url_phraze_pivot (urls_id, phrazes_id, occurrences) values (%s, %s, 0)', (url_id, description_id))
        else:
            self.cursor.execute('update url_phraze_pivot set occurrences = occurrences + 1 where urls_id = %s and phrazes_id = %s', (url_id, description_id))
        # commit so the inserts persist (pymysql does not autocommit by default)
        self.cursor.connection.commit()
        print(' ')
        print(' -------- save link -------- ')
        print(url, title, description)
        print(' ')

    def run(self):
        '''
        thread run, save url
        '''
        # get the first item from the queue
        item = self.queue.get()
        # while there is an item in the current queue
        while self.running:
            try:
                if item is not None:
                    # get the parts
                    url = item['url']
                    title = item['title']
                    description = item['description']
                    # save the parts
                    self.saveLink(url, title, description)
                time.sleep(2)
                # wait for the next item; the timeout lets Empty end the loop
                item = self.queue.get(timeout=10)
            except Empty as e:
                self.running = False
                continue
            except Exception as e:
                print(' ')
                print(' -------- Sql Worker exception: -------- ')
                print(e)
                print(' ')
                print("-" * 60)
                traceback.print_exc(file=sys.stdout)
                print("-" * 60)
                continue
            self.queue.task_done()
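saveLink assumes three tables already exist: urls, phrazes and url_phraze_pivot. The original post does not include the schema, so the snippet below is a guessed minimal reconstruction from the column names used in the queries (urls_id, url, phrazes_id, phraze, occurrences); the column types and sizes are assumptions, not taken from the source.

import pymysql

SCHEMA = [
    '''create table if not exists urls (
           urls_id int not null auto_increment primary key,
           url varchar(2048) not null
       )''',
    '''create table if not exists phrazes (
           phrazes_id int not null auto_increment primary key,
           phraze text
       )''',
    '''create table if not exists url_phraze_pivot (
           urls_id int not null,
           phrazes_id int not null,
           occurrences int not null default 0
       )''',
]

# connection details mirror spider.initDb(); fill in host/user/passwd for your setup
conn = pymysql.connect(host='', unix_socket='/tmp/mysql.sock', user='', passwd='', db='')
cur = conn.cursor()
cur.execute('USE `0090-scraping`')
for statement in SCHEMA:
    cur.execute(statement)
conn.commit()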
# worker.py — the Worker thread fetches pages and extracts links and page metadata.
import sys, traceback
from threading import Thread
import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import logging
import re
from queue import Empty
import pymysql
import time

'''
logging config
'''
logging.basicConfig(level=logging.DEBUG)


class Worker(Thread):

    def __init__(self, queue, threadNum, cur, visitedLinks):
        '''
        init with the queue and the shared state - this worker is a Thread subclass
        '''
        Thread.__init__(self)
        self.threadNum = str(threadNum)
        self.cursor = cur
        self.visitedLinks = visitedLinks
        self.queue = queue
        self.urlDetails = []
        self.running = True

    def join(self, timeout=None):
        '''
        when the thread joins send the loop end signal
        '''
        self.running = False
        super(Worker, self).join(timeout)

    def getVisitedLinks(self):
        '''
        return the worker's current visited links
        '''
        return self.visitedLinks

    def setVisitedLinks(self, visitedLinks):
        '''
        add to the worker's visited links
        '''
        for i in visitedLinks:
            self.visitedLinks.add(i)

    def getCurrentDomain(self, page):
        '''
        get the domain we are crawling
        '''
        parsed_uri = urlparse(page)
        return parsed_uri.netloc

    def getMetaTitle(self, html):
        '''
        extract the page meta title
        '''
        if html.title is not None:
            return html.title.string.encode('utf-8')
        return ""

    def getMetaDescription(self, html):
        '''
        extract the page meta description
        '''
        description = html.findAll(attrs={"name": "description"})
        if len(description) and description[0]['content'] is not None:
            return description[0]['content'].encode('utf-8')
        return ""

    def encodeLink(self, link):
        '''
        attempt to fix encoding issues with links
        '''
        raw = bytes(link, "UTF-8")
        link = raw.decode("ascii", "ignore")
        return link

    def fetch(self, uri):
        '''
        url open, check the headers for text/html
        if so return data
        '''
        uri = self.encodeLink(uri)
        self.visitedLinks.add(uri)
        try:
            h = urlopen(uri)
            x = h.info()
            if 'text/html' in (x.get('Content-Type') or '').lower():
                return h.read()
            else:
                return None
        except urllib.error.URLError:
            return None

    def getUrlDetails(self):
        '''
        get the list of url details
        '''
        return self.urlDetails

    def getLinks(self, page, url):
        '''
        find all anchor links within the page
        add to either array depending on its http(s) status
        '''
        internalLinks = []
        externalLinks = []
        currentDomain = self.getCurrentDomain(url)
        self.urlparse = urlparse(url)
        for link in page.findAll('a', href=True):
            if link.has_attr('href'):
                if link['href'].startswith('#') == False and link['href'] != '/':
                    # internal link - rel link
                    if currentDomain not in link['href'] and link['href'].startswith('http://') == False and link['href'].startswith('https://') == False:
                        if currentDomain.endswith('/'):
                            currentDomain = currentDomain[:-1]
                        if link['href'].startswith('/'):
                            link['href'] = link['href'][1:]
                        link['href'] = self.urlparse.scheme + '://' + currentDomain + '/' + link['href']
                        link = link['href']
                        internalLinks.append(link)
                    # external link
                    elif currentDomain not in link['href'] and (link['href'].startswith('http://') or link['href'].startswith('https://')):
                        link = link['href']
                        externalLinks.append(link)
                    # internal link non rel
                    elif currentDomain in link['href']:
                        if link['href'].startswith('http://') == False and link['href'].startswith('https://') == False:
                            if currentDomain.endswith('/'):
                                currentDomain = currentDomain[:-1]
                            if link['href'].startswith('/'):
                                link['href'] = link['href'][1:]
                            link['href'] = self.urlparse.scheme + '://' + currentDomain + '/' + link['href']
                        link = link['href']
                        internalLinks.append(link)
        return internalLinks, externalLinks

    def run(self):
        '''
        thread run, check url
        '''
        # get the first item from the queue
        item = self.queue.get()
        # while there is an item in the current queue
        while self.running:
            try:
                # the current url
                url = item['url']
                # make sure we have not yet visited
                if url not in self.visitedLinks:
                    # fetch the html
                    data = self.fetch(url)
                    if data is None:
                        # could not get text/html data from the url
                        # logging.info('[-] Thread: ' + self.threadNum + ' - Could not fetch: %s because type != text/html', url)
                        pass
                    else:
                        # log the current url we are scraping
                        logging.info('[+] Thread: ' + self.threadNum + ' - Success fetched: %s', url)
                        # create the BeautifulSoup object
                        bsObj = BeautifulSoup(data, 'lxml')
                        # get the internal and external links
                        internalLinks, externalLinks = self.getLinks(bsObj, url)
                        # get the meta title
                        metaTitle = self.getMetaTitle(bsObj)
                        # get the meta description
                        metaDescription = self.getMetaDescription(bsObj)
                        # add to the save queue
                        self.urlDetails.append({
                            'url': url,
                            'title': metaTitle,
                            'description': metaDescription
                        })
                        # queue the internal links for crawling
                        for i in internalLinks:
                            self.queue.put({'url': i})
                        # queue the external links for crawling
                        for i in externalLinks:
                            self.queue.put({'url': i})
                # have a quick nap
                time.sleep(2)
                # wait for the next item; the timeout lets Empty end the loop
                item = self.queue.get(timeout=10)
            except Empty as e:
                print(' ')
                print(' -------- Thread empty ' + self.threadNum + ': -------- ')
                print(e)
                print(' ')
                print("-" * 60)
                traceback.print_exc(file=sys.stdout)
                print("-" * 60)
                self.running = False
            except Exception as e:
                print(' ')
                print(' -------- Thread Running exception ' + self.threadNum + ': -------- ')
                print(e)
                print(' ')
                print("-" * 60)
                traceback.print_exc(file=sys.stdout)
                print("-" * 60)
                continue
            self.queue.task_done()
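getLinks() above normalises relative hrefs by hand, stripping slashes and rebuilding scheme://domain/path. An alternative sketch, shown here purely for comparison and not part of the original code, lets urllib.parse.urljoin resolve each href against the page URL and then splits internal from external links by comparing netlocs; the function name split_links is my own.

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def split_links(page, url):
    # page is a BeautifulSoup document, as in Worker.getLinks
    current_domain = urlparse(url).netloc
    internal, external = [], []
    for a in page.findAll('a', href=True):
        href = a['href']
        # skip in-page anchors and bare "/" links, like the original
        if href.startswith('#') or href == '/':
            continue
        absolute = urljoin(url, href)  # resolves relative paths against the page URL
        if urlparse(absolute).netloc == current_domain:
            internal.append(absolute)
        else:
            external.append(absolute)
    return internal, external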