python学习 —— 建立IP代理池

Posted darkchii

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python学习 —— 建立IP代理池相关的知识,希望对你有一定的参考价值。

  代码:

from bs4 import BeautifulSoup
from requests import Session, get, post
from time import sleep
import random
import re, os


class ProxyIpPool(object):
    """Scrape one page of free proxies from www.kuaidaili.com.

    The site renders proxies in an HTML table whose ``<td>`` cells carry a
    ``data-title`` attribute naming the column (IP / PORT / 类型 / 位置).
    """

    def __init__(self, page):
        """
        Args:
            page: 1-based page number of the free-proxy listing to fetch.
        """
        object.__init__(self)
        self.page = page

    def init_proxy_ip_pool(self):
        """Fetch the configured listing page and return the scraped columns.

        Returns:
            dict with keys 'ip', 'port', 'type', 'position'; each value is a
            one-element list whose single item is the list of strings scraped
            from that table column (shape kept for caller compatibility).
        """
        url = 'https://www.kuaidaili.com/free/'

        tablelist = ['IP', 'PORT', '类型', '位置']

        ip = []
        port = []
        proxy_type = []  # renamed from `type` to avoid shadowing the builtin
        position = []

        session = Session()

        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'www.kuaidaili.com',
            # 'Referer': url,  # When paging, each page's Referer would be the
            # URL of the link that led here (e.g. the previous listing page).
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.168 Safari/537.36'
        }

        # Page 1 lives at /free/; later pages at /free/inha/<n>/.
        if self.page > 1:
            url = url + 'inha/' + str(self.page) + '/'

        response = session.get(url, headers=headers, timeout=2)
        print(response.status_code)
        soup = BeautifulSoup(response.text, 'lxml')
        tags = soup.find_all('td', attrs={'data-title': tablelist})
        tag_text = str(tags)

        # Regex-extract each column from the serialized tag list.
        ip.append(re.compile(r'data-title="IP">(.+?)</td').findall(tag_text))
        port.append(re.compile(r'data-title="PORT">(.+?)</td').findall(tag_text))
        proxy_type.append(re.compile(r'data-title="类型">(.+?)</td').findall(tag_text))
        position.append(re.compile(r'data-title="位置">(.+?)</td').findall(tag_text))

        # Random politeness delay (0-7 s) to avoid hammering the site.
        sleep(random.random() * 7)

        # Columns packaged as a dict; keys are part of the public contract.
        return {'ip': ip, 'port': port, 'type': proxy_type, 'position': position}


def create_proxy_ip_pool(page, output_path='C:/Users/adimin/Desktop/proxyip.txt'):
    """Scrape one listing page and append its proxies to a text file.

    Args:
        page: 1-based page number of the kuaidaili free-proxy listing.
        output_path: file the formatted rows are appended to; defaults to
            the original hard-coded path for backward compatibility.

    Side effects:
        Prints each row and appends it to ``output_path``; exits the process
        with status 2 if the file cannot be written.
    """
    pool = ProxyIpPool(page).init_proxy_ip_pool()

    print('初始化完成!开始创建代理池...')

    iplist = pool.get('ip')
    portlist = pool.get('port')
    typelist = pool.get('type')  # fixed original 'typelsit' typo
    positionlist = pool.get('position')

    # Each column is a one-element list wrapping the per-row string list.
    for i in range(len(iplist[0])):
        row = (format(iplist[0][i], '<22') + format(portlist[0][i], '<17')
               + format(typelist[0][i], '<12') + positionlist[0][i])
        print(row)
        try:
            # Mode 'a' creates the file if missing, so no existence check
            # is needed.  '\n' is correct in text mode: Windows translates
            # it to '\r\n' on write (the original '\r\n' produced '\r\r\n').
            with open(output_path, 'a') as fp:
                fp.write(row + '\n')
        except OSError as err:  # FileExistsError can never occur for mode 'a'
            print(err)
            os._exit(2)

if __name__ == '__main__':
    print('正在初始化代理池...请耐心等待...')

    # Build the column header once and reuse it for console and file output.
    header = (format('IP', '^16') + format('PORT', '^16')
              + format('类型', '^16') + format('位置', '^16'))
    print(header)
    # Mode 'a' creates the file when it does not exist, so the original
    # try/except fallback that reopened with mode 'w' was unreachable.
    with open('C:/Users/adimin/Desktop/proxyip.txt', 'a') as fp:
        fp.write(header + '\n')

    # NOTE(review): the original author found that looping inside
    # init_proxy_ip_pool only fetched about one page, so paging is
    # driven from this outer loop instead.
    for page in range(1, 2177):
        create_proxy_ip_pool(page)

  运行结果:

技术分享图片

  保存到本地:

技术分享图片

 

以上是关于python学习 —— 建立IP代理池的主要内容,如果未能解决你的问题,请参考以下文章

python 爬虫 ip池怎么做

整个大活,采集8个代理IP站点,为Python代理池铺路,爬虫120例之第15例

整个大活,采集8个代理IP站点,为Python代理池铺路,爬虫120例之第15例

python使用redis实现ip代理池

python 爬虫 ip池怎么做

Python_01_IP代理池_实现代理池的检测模块