文本分类
Posted by longyongzhen
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了文本分类相关的知识,希望对你有一定的参考价值。
#-*- coding:utf-8 -*-
import re
import requests
class Spider:
#页面初始化
def __init__(self):
self.url = ‘http://gz.meituan.com/category/meishi?mtt=1.index%2Ffloornew.nc.1.irj38puy‘
def get_class_index(self):
    """Fetch the category index page and list the categories on it.

    Downloads ``self.url``, isolates the markup that follows the
    "分类:" label, and extracts a ``(link, name)`` pair from every
    ``<li>`` entry inside it. Categories are numbered from 1 in page
    order and written to ``mt_class_index.txt``, one
    ``number,link,name`` line per category.

    Returns:
        A list of ``(number, link, name)`` tuples, where ``number`` is
        the 1-based position rendered as a string. When the category
        block is not found in the page the list is empty and the index
        file is left untouched.
    """
    response = requests.get(self.url)
    # The body is matched in its UTF-8 encoded form, so the captured
    # links and names are UTF-8 encoded as well.
    # NOTE(review): matching str patterns against the encoded body relies
    # on Python 2 string semantics -- confirm the target interpreter.
    page = response.text.encode('UTF-8')

    block_pattern = re.compile(r'<div class="label has-icon">分类:</div>(.*?)</div>', re.S)
    blocks = block_pattern.findall(page)
    if not blocks:
        # Page layout changed or the request was redirected: nothing to index.
        return []

    entry_pattern = re.compile(r'<li.*?href="(.*?)">(.*?)</a></li>', re.S)
    entries = entry_pattern.findall(blocks[0])

    result = []
    # The context manager closes the file even if a write fails.
    with open('mt_class_index.txt', "w+") as f:
        for number, (link, name) in enumerate(entries, 1):
            class_id = str(number)
            f.write(class_id + ',' + link + ',' + name + '\n')
            result.append((class_id, link, name))
    return result
def getEverryClass(self,link):
r = requests.get(link)
pattern1 = re.compile(r‘<div class="paginator-wrapper">(.*?)</div>‘,re.S)
items = re.findall(pattern1,r.text.encode(‘UTF-8‘))
pattern2 = re.compile(r‘<li.*?href="(.*?)".*?</li>‘,re.S)
pattern3 = re.compile(r‘<i class="icon icon-shangjia">.*?<a class="link f3 J-mtad-link".*?target="_blank">(.*?)</a>‘,