基于 springBoot+jsoup一 || 爬取全国行政区划数据

Posted kevin_ying

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了基于 springBoot+jsoup一 || 爬取全国行政区划数据相关的知识,希望对你有一定的参考价值。

一、代码演示

如果爬取中途中断,可在代码中筛选并跳过已拉取的省份,然后重新执行。

/**
 * REST controller that crawls the administrative-division code pages published on
 * stats.gov.cn and stores them level by level through {@code ProvinceService}:
 * province, city, district/county, street/town, community/village.
 *
 * <p>The table rows of each level are located by CSS class:
 * {@code provincetr}, {@code citytr}, {@code countytr}, {@code towntr}, {@code villagetr}.
 *
 * <p>No crawl state is kept in instance fields; every page URL is passed down explicitly and
 * child links are resolved against the URL of the page they were found on.
 *
 * @author kevin
 * @createTime 2019-11-18 19:37
 */
@RestController
public class CityController {

    /** Index page that lists the available years. */
    private static final String ROOT_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
    /** Charset name handed to {@code HttpUtil#get} for every page. */
    private static final String CHARSET = "gb2312";
    /** Upper bound on download attempts per page, so a dead link cannot stall the crawl forever. */
    private static final int MAX_FETCH_ATTEMPTS = 5;
    /** Base pause between two attempts; multiplied by the attempt number. */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    /** Audit/ownership values written on every inserted row. */
    private static final long COUNTRY_ID = 1196612453660643329L;
    private static final long CREATE_USER_ID = 1L;
    private static final String CREATE_USER_NAME = "admin";

    @Autowired
    private ProvinceService provinceService;
    @Autowired
    private HttpUtil httpUtil;

    /**
     * Starts a full crawl: picks the first year link on the index page, then walks every
     * province on that year's page and descends into its cities, districts, streets and
     * communities.
     *
     * @return a template wrapping {@code "fail"} when the index page or the year page cannot be
     *         loaded or has an unexpected layout, otherwise {@code "spider crawl end."}
     * @throws Exception if the initial index-page request fails with an I/O error
     */
    @GetMapping("/start")
    public ResultTemplate<String> spider() throws Exception {
        Document rootDoc = httpUtil.get(ROOT_URL, CHARSET);
        if (rootDoc == null) {
            return of("fail");
        }
        Elements yearLists = rootDoc.getElementsByClass("center_list_contlist");
        if (yearLists.isEmpty()) {
            return of("fail");
        }
        Elements yearLinks = yearLists.get(0).select("a");
        if (yearLinks.isEmpty()) {
            return of("fail");
        }
        // The first link is treated as the most recent year, e.g. .../2018/index.html
        String yearUrl = resolve(ROOT_URL, yearLinks.get(0).attr("href"));
        Document yearDoc = fetchWithRetry(yearUrl, CHARSET);
        if (yearDoc == null) {
            return of("fail");
        }

        // Each "provincetr" row holds several province links.
        for (Element row : yearDoc.getElementsByClass("provincetr")) {
            for (Element link : row.select("a")) {
                String name = link.text();
                if (isAlreadyCrawled(name)) {
                    System.out.println("未执行市:" + name);
                    continue;
                }
                // href looks like "11.html"; the file name without extension is the province code
                String href = link.attr("href");
                DicProvince province = new DicProvince()
                        .setProvinceName(name)
                        .setProvinceCode(codeFromHref(href))
                        .setCountryId(COUNTRY_ID)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(CREATE_USER_ID)
                        .setCreateUsername(CREATE_USER_NAME);
                System.out.println("开始时间:" + LocalDateTime.now());
                System.out.println("省名称:" + name);
                Long id = provinceService.insertProvince(province);
                getCites(resolve(yearUrl, href), CHARSET, id);
            }
        }
        return of("spider crawl end.");
    }

    /**
     * Provinces that were stored by an earlier, interrupted run and must not be crawled again.
     * Edit this list before restarting a crawl.
     */
    private static boolean isAlreadyCrawled(String provinceName) {
        return "北京市".equals(provinceName)
                || "天津市".equals(provinceName)
                || "河北省".equals(provinceName);
    }

    /** Stores every city listed on a province page and descends into its districts. */
    private void getCites(String url, String charset, Long provinceId) {
        Document page = fetchWithRetry(url, charset);
        if (page == null) {
            return;
        }
        for (Element row : page.getElementsByClass("citytr")) {
            Elements links = row.select("a");
            if (links.size() < 2) {
                System.out.println("跳过没有链接的行:" + row.text());
                continue;
            }
            // First link carries the code, second link carries the name; href e.g. "11/1101.html"
            Element link = links.get(1);
            String href = link.attr("href");
            DicCity city = new DicCity()
                    .setCityName(link.text())
                    .setCityCode(codeFromHref(href))
                    .setProvinceId(provinceId)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(CREATE_USER_ID)
                    .setCreateUsername(CREATE_USER_NAME);
            Long id = provinceService.insertCity(city);
            getDistrict(resolve(url, href), charset, id);
        }
    }

    /**
     * Stores every district/county listed on a city page.
     *
     * <p>Rows with links are stored with the code taken from the link's file name and are
     * descended into. Rows without links are stored from the plain text of their first two
     * cells (code, name) and have no children to crawl.
     */
    private void getDistrict(String url, String charset, Long idDis) {
        Document page = fetchWithRetry(url, charset);
        if (page == null) {
            return;
        }
        for (Element row : page.getElementsByClass("countytr")) {
            Elements links = row.select("a");
            if (links.size() >= 2) {
                Element link = links.get(1);
                String href = link.attr("href");
                DicDistrict district = new DicDistrict()
                        .setDistrictName(link.text())
                        .setDistrictCode(codeFromHref(href))
                        .setCityId(idDis)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(CREATE_USER_ID)
                        .setCreateUsername(CREATE_USER_NAME);
                Long id = provinceService.insertDistrict(district);
                getStreet(resolve(url, href), charset, id);
                continue;
            }

            // Decided by inspecting the row, not by catching an exception, so that a failure
            // further down the tree can never insert the same district a second time.
            Elements cells = row.select("td");
            if (cells.size() < 2) {
                System.out.println("跳过无法解析的行:" + row.text());
                continue;
            }
            System.out.println("市辖区");
            DicDistrict district = new DicDistrict()
                    .setDistrictName(cells.get(1).text())
                    .setDistrictCode(cells.get(0).text())
                    .setCityId(idDis)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(CREATE_USER_ID)
                    .setCreateUsername(CREATE_USER_NAME);
            provinceService.insertDistrict(district);
            System.out.println("执行完毕");
        }
    }

    /** Stores every street/town listed on a district page and descends into its communities. */
    private void getStreet(String url, String charset, Long idStr) {
        Document page = fetchWithRetry(url, charset);
        if (page == null) {
            return;
        }
        for (Element row : page.getElementsByClass("towntr")) {
            Elements links = row.select("a");
            if (links.size() < 2) {
                System.out.println("跳过没有链接的行:" + row.text());
                continue;
            }
            Element link = links.get(1);
            String href = link.attr("href");
            DicStreet street = new DicStreet()
                    .setStreetName(link.text())
                    .setStreetCode(codeFromHref(href))
                    .setDistrictId(idStr)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(CREATE_USER_ID)
                    .setCreateUsername(CREATE_USER_NAME);
            Long id = provinceService.insertStreet(street);
            getCommunity(resolve(url, href), charset, id);
        }
    }

    /**
     * Stores every community/village listed on a street page. These rows are the leaves of the
     * tree; their three cells are read as code, classification code and name.
     */
    private void getCommunity(String url, String charset, Long idPro) {
        Document page = fetchWithRetry(url, charset);
        if (page == null) {
            return;
        }
        for (Element row : page.getElementsByClass("villagetr")) {
            Elements cells = row.select("td");
            if (cells.size() < 3) {
                System.out.println("跳过无法解析的行:" + row.text());
                continue;
            }
            DicCommunity community = new DicCommunity()
                    .setCommunityName(cells.get(2).text())
                    .setCommunityCode(cells.get(0).text())
                    .setClassificationCode(cells.get(1).text())
                    .setStreetId(idPro)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(CREATE_USER_ID)
                    .setCreateUsername(CREATE_USER_NAME);
            provinceService.insertCommunity(community);
        }
    }

    /**
     * Downloads a page, retrying a bounded number of times with a growing pause.
     *
     * @return the parsed page, or {@code null} when every attempt failed or the thread was
     *         interrupted; callers skip the corresponding subtree in that case
     */
    private Document fetchWithRetry(String url, String charset) {
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            if (attempt >= 3) {
                System.out.println("循环次数:" + attempt);
            }
            try {
                Document doc = httpUtil.get(url, charset);
                if (doc != null) {
                    return doc;
                }
            } catch (Exception e) {
                System.out.println("请求网页链接报错");
            }
            if (attempt < MAX_FETCH_ATTEMPTS && !pause(RETRY_DELAY_MILLIS * attempt)) {
                break;
            }
        }
        System.out.println("多次重试后仍无法获取页面,已跳过:" + url);
        return null;
    }

    /** Sleeps for the given time; returns {@code false} (with the interrupt flag restored) if interrupted. */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Resolves a link found on {@code pageUrl} to an absolute URL. Absolute links are returned
     * unchanged; relative ones such as {@code "11/1101.html"} are resolved against the page.
     */
    private static String resolve(String pageUrl, String href) {
        try {
            return java.net.URI.create(pageUrl).resolve(href.trim()).toString();
        } catch (IllegalArgumentException e) {
            // Link is not a syntactically valid URI: fall back to plain directory concatenation.
            return pageUrl.substring(0, pageUrl.lastIndexOf("/") + 1) + href;
        }
    }

    /** Extracts the division code from a link, i.e. the file name without its extension ("11/1101.html" gives "1101"). */
    private static String codeFromHref(String href) {
        int start = href.lastIndexOf('/') + 1;
        int dot = href.indexOf('.', start);
        return dot < 0 ? href.substring(start) : href.substring(start, dot);
    }

}

 

二、HttpUtil工具类

/**
 * Small HTTP helper that downloads a page with {@code HttpURLConnection} and parses the
 * response body into a jsoup {@code Document}.
 *
 * @author kevin
 * @createTime 2019-11-20 9:17
 */
@Component
public class HttpUtil {

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";
    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 80000;

    /**
     * Issues a GET request and parses the response.
     *
     * @param url     absolute URL of the page; also used as the document's base URI
     * @param charset charset name used to decode the response body
     * @return the parsed document, or {@code null} when reading or parsing the response failed
     * @throws IOException if the URL is malformed or the connection cannot be set up
     */
    public Document get(String url, String charset) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            // Always hit the server; never serve a cached response.
            connection.setUseCaches(false);
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // try-with-resources guarantees the stream is closed even when parsing fails.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                // Callers treat null as "retry later", so report the cause instead of rethrowing.
                System.out.println("parse error: " + url + " (" + e + ")");
                return null;
            }
        } finally {
            connection.disconnect();
        }
    }

}

 

三、service部分,根据需要自行定义数据库表

/**
 * {@code ProvinceService} implementation that persists each administrative level through its
 * mapper. Every insert is retried a bounded number of times; when all attempts fail an
 * {@link IllegalStateException} carrying the last error is thrown instead of looping forever.
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    /** Upper bound on attempts per row, so a deterministic failure cannot spin indefinitely. */
    private static final int MAX_INSERT_ATTEMPTS = 3;
    /** Pause between two attempts, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 500L;

    @Autowired
    private ProvinceMapper provinceMapper;
    @Autowired
    private CityMapper cityMapper;
    @Autowired
    private DistrictMapper districtMapper;
    @Autowired
    private StreetMapper streetMapper;
    @Autowired
    private CommunityMapper communityMapper;

    /**
     * Inserts a province row.
     *
     * @return the id held by the entity after the insert (presumably filled in by the mapper;
     *         confirm against the mapper configuration)
     * @throws IllegalStateException if the row could not be inserted after all attempts
     */
    @Override
    public Long insertProvince(DicProvince dicProvince) {
        insertWithRetry("插入省数据失败", () -> provinceMapper.insert(dicProvince));
        return dicProvince.getProvinceId();
    }

    /**
     * Inserts a city row.
     *
     * @return the id held by the entity after the insert
     * @throws IllegalStateException if the row could not be inserted after all attempts
     */
    @Override
    public Long insertCity(DicCity dicCity) {
        insertWithRetry("插入市数据失败", () -> cityMapper.insert(dicCity));
        return dicCity.getCityId();
    }

    /**
     * Inserts a district/county row.
     *
     * @return the id held by the entity after the insert
     * @throws IllegalStateException if the row could not be inserted after all attempts
     */
    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        insertWithRetry("插入区县数据失败", () -> districtMapper.insert(dicDistrict));
        return dicDistrict.getDistrictId();
    }

    /**
     * Inserts a street/town row.
     *
     * @return the id held by the entity after the insert
     * @throws IllegalStateException if the row could not be inserted after all attempts
     */
    @Override
    public Long insertStreet(DicStreet dicStreet) {
        insertWithRetry("插入街道数据失败", () -> streetMapper.insert(dicStreet));
        return dicStreet.getStreetId();
    }

    /**
     * Inserts a community/village row.
     *
     * @return the id held by the entity after the insert
     * @throws IllegalStateException if the row could not be inserted after all attempts
     */
    @Override
    public Long insertCommunity(DicCommunity dicCommunity) {
        insertWithRetry("插入社区数据失败", () -> communityMapper.insert(dicCommunity));
        return dicCommunity.getCommunityId();
    }

    /** A single insert attempt that reports the number of affected rows. */
    private interface InsertCall {
        int execute() throws Exception;
    }

    /**
     * Runs the insert until it reports exactly one affected row, giving up after
     * {@link #MAX_INSERT_ATTEMPTS} attempts or when the thread is interrupted.
     *
     * @param failureMessage text printed on each failed attempt and used as the exception message
     * @param call           the insert to execute
     * @throws IllegalStateException when no attempt succeeded; the cause is the last exception
     *                               seen, or {@code null} if the mapper only returned a row
     *                               count other than 1
     */
    private static void insertWithRetry(String failureMessage, InsertCall call) {
        Exception lastError = null;
        for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
            try {
                if (call.execute() == 1) {
                    return;
                }
            } catch (Exception e) {
                lastError = e;
                System.out.println(failureMessage);
                e.printStackTrace();
            }
            if (attempt < MAX_INSERT_ATTEMPTS) {
                try {
                    Thread.sleep(RETRY_DELAY_MILLIS);
                } catch (InterruptedException interrupted) {
                    // Preserve the interrupt for the caller and stop retrying.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        throw new IllegalStateException(failureMessage, lastError);
    }

}

  

以上是关于基于 springBoot+jsoup一 || 爬取全国行政区划数据的主要内容,如果未能解决你的问题,请参考以下文章

java爬虫---爬虫+基于接口的网络爬虫

JAVA网络爬爬学习之HttpClient+Jsoup

java jsoup怎样爬取特定网页内的数据

Jsoup-简单爬取知乎推荐页面(附:get_agent())

java爬虫,网页简易爬小说程序

Java爬虫实战:Jsoup+WebClient实现音乐爬取~