python 爬取乌云所有厂商名字，url，漏洞总数 并存入数据库
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 爬取乌云所有厂商名字，url，漏洞总数 并存入数据库相关的知识,希望对你有一定的参考价值。
标签：
需要：mysqldb
下面是数据表结构：
/*
Navicat MySQL Data Transfer

Source Server         : 127.0.0.1
Source Server Version : 50509
Source Host           : 127.0.0.1:3306
Source Database       : wooyun

Target Server Type    : MYSQL
Target Server Version : 50509
File Encoding         : 65001

Date: 2015-09-24 17:38:14
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for wooyun_vul
-- ----------------------------
-- One row per scraped vendor: its name, its site URL and the vulnerability
-- count the scraper extracted for it.
-- NOTE: the statements must stay on separate lines; when the dump is
-- collapsed onto a single line the "--" comments swallow the DDL after them.
DROP TABLE IF EXISTS `wooyun_vul`;
CREATE TABLE `wooyun_vul` (
  `id` int(8) NOT NULL AUTO_INCREMENT,
  `corpsname` varchar(255) DEFAULT NULL,
  `corpsurl` varchar(255) DEFAULT NULL,
  `vulcount` int(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
-- utf8 (not latin1) so that it matches the scraper's "set names 'utf8'"
-- and non-Latin vendor names are stored intact.
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
python 脚本：
# coding=utf-8
"""Scrape the vendor list from wooyun.org and store it in MySQL.

For every vendor listing page the script collects the vendor names and
vendor site URLs, then visits each vendor page to read a count, and
finally writes one row per vendor into the ``wooyun_vul`` table
(columns ``corpsname``, ``corpsurl``, ``vulcount``).

This is a Python 2 script (``urllib2``); it needs the MySQLdb package.
"""
import urllib2
import urllib
import re
import MySQLdb

# Base URL of the paginated vendor listing; the page number is appended.
url = "http://wooyun.org/corps/page/"

# Browser-like User-Agent sent with every request.
_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36')

# Patterns are compiled once instead of on every page fetch.
_CORPS_NAME_RE = re.compile(
    r'<td width="370"><a href="\/corps\/(.*?)">.*?<\/a><\/td>')
_CORPS_URL_RE = re.compile(r'<a rel="nofollow" href="(.*?)" target=')
_PAGE_COUNT_RE = re.compile(r'<p class="page">.*?(\d+).*')


def _fetch(page_url):
    """Return the raw body of ``page_url``, requested with a browser UA."""
    request = urllib2.Request(page_url)
    request.add_header('User-Agent', _USER_AGENT)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the socket even if reading fails.
        response.close()


def getWooyuncorps(url):
    """Extract vendor names and vendor site URLs from one listing page.

    :param url: address of a vendor listing page.
    :returns: ``(corps, corpsUrl)`` -- the names captured from the
        ``/corps/<name>`` links and the ``href`` values of the
        ``rel="nofollow"`` links, each in page order.
    :raises urllib2.URLError: if the page cannot be fetched.
    """
    content = _fetch(url)
    corps = _CORPS_NAME_RE.findall(content)
    corpsUrl = _CORPS_URL_RE.findall(content)
    return corps, corpsUrl


def getcorpscount(url):
    """Extract the number(s) shown in the pager of a vendor page.

    :param url: address of a single vendor page.
    :returns: list of digit strings, one for each line of the page that
        matches ``<p class="page">...<digits>``; empty if none matches.
    :raises urllib2.URLError: if the page cannot be fetched.
    """
    return _PAGE_COUNT_RE.findall(_fetch(url))


corpslist = []
corpsurllist = []
countlist = []

# The listing is assumed to span pages 1..36 (hard-coded in the original).
for i in range(1, 37):
    corps, corpsUrl = getWooyuncorps(url + str(i))
    corpslist.extend(corps)
    corpsurllist.extend(corpsUrl)
print("%d %d" % (len(corpslist), len(corpsurllist)))

for corp in corpslist:
    newurl = "http://www.wooyun.org/corps/" + urllib.quote(corp)
    count = getcorpscount(newurl)
    # Record exactly one entry per vendor so countlist stays aligned with
    # corpslist: the first match, or None (stored as NULL) when the page
    # yields no count.  Appending every match, as before, shifted the
    # counts of all later vendors onto the wrong rows.
    countlist.append(int(count[0]) if count else None)

if len(corpsurllist) != len(corpslist):
    # Names and site URLs come from two independent regexes; if a vendor
    # row lacks a site link the lists no longer pair up one-to-one.
    print("warning: %d names but %d urls; extra entries are dropped"
          % (len(corpslist), len(corpsurllist)))

conn = MySQLdb.connect('localhost', 'root', '', 'wooyun')
try:
    cur = conn.cursor()
    cur.execute("set names 'utf8'")
    conn.commit()
    # Parameterized statement: scraped text is passed as data, so quotes
    # or SQL fragments in a vendor name/URL cannot alter the query.
    insert_sql = ('insert into wooyun_vul(corpsname,corpsurl,vulcount) '
                  'values(%s,%s,%s)')
    for row in zip(corpslist, corpsurllist, countlist):
        print("insert %s %s %s" % row)
        cur.execute(insert_sql, row)
    # Single commit: a failure part-way leaves the table untouched rather
    # than half-filled.
    conn.commit()
finally:
    conn.close()
print("success")
銆€銆€
以上是关于python 爬取乌云所有厂商名字，url，漏洞总数 并存入数据库的主要内容,如果未能解决你的问题,请参考以下文章