基于 springBoot+jsoup一 || 爬取全国行政区划数据
Posted kevin_ying
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了基于 springBoot+jsoup一 || 爬取全国行政区划数据相关的知识,希望对你有一定的参考价值。
一、代码演示
如果爬取中途中断,可在代码中筛选并跳过已拉取的省份数据,然后重新执行。
/** * TODO * * @author kevin * @createTime 2019-11-18 19:37 */ @RestController public class CityController { @Autowired private ProvinceService provinceService; @Autowired private HttpUtil httpUtil; private String yearHref = ""; private int index; // {"provincetr", "citytr", "countytr", "towntr", "villagetr"}; @GetMapping("/start") public ResultTemplate<String> spider() throws Exception { String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/"; String charset = "gb2312"; Document rootDoc = httpUtil.get(url, charset); if (rootDoc == null) { return of("fail"); } Element firstElement = rootDoc.getElementsByClass("center_list_contlist").get(0); // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html yearHref = firstElement.select("a").get(0).attr("href"); // 最近一个年份的省份链接 Document doc = httpUtil.get(yearHref, charset); // 遍历所有的省 Elements provinceElements = doc.getElementsByClass("provincetr"); for (Element element : provinceElements) { Elements aEles = element.select("a"); for (Element aEle : aEles) { String name = aEle.text(); // 11.html String provincesHref = aEle.attr("href"); String code = provincesHref.substring(0, provincesHref.indexOf(".")); index = yearHref.lastIndexOf("/") + 1; // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html provincesHref = yearHref.substring(0, index) + provincesHref; DicProvince province = new DicProvince() .setProvinceName(name) .setProvinceCode(code) .setCountryId(1196612453660643329L) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); if ("北京市".equals(name) || "天津市".equals(name) || "河北省".equals(name)) { System.out.println("未执行市:" + name); } else { System.out.println("开始时间:" + LocalDateTime.now()); System.out.println("省名称:" + name); Long id = provinceService.insertProvince(province); getCites(provincesHref, charset, id); } } } return of("spider crawl end."); } private void getCites(String url, String charset, Long provinceId) throws Exception { Document rootDoc = null; 
int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("citytr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("a").get(1); // 第二个是市的名字 String name = aEle.text(); // 11/1101.html String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); cityHref = yearHref.substring(0, index) + cityHref; DicCity city = new DicCity() .setCityName(name) .setCityCode(code) .setProvinceId(provinceId) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertCity(city); //Long id=1L; getDistrict(cityHref, charset, id); } } } // 区县 private void getDistrict(String url, String charset, Long idDis) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("countytr"); for (Element cityElement : cityElements) { try { Element aEle = cityElement.select("a").get(1); String name = aEle.text(); String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1; cityHref = url.substring(0, index) + cityHref; DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertDistrict(district); //Long id=1L; getStreet(cityHref, charset, id); } catch 
(Exception e) { System.out.println("市辖区"); Element aEle = cityElement.select("td").get(0); String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1); String name = aEle2.text(); DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis); Long id = provinceService.insertDistrict(district); System.out.println("执行完毕"); } } } } // 街道 private void getStreet(String url, String charset, Long idStr) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("towntr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("a").get(1); // 第二个是市的名字 String name = aEle.text(); String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1; cityHref = url.substring(0, index) + cityHref; DicStreet street = new DicStreet() .setStreetName(name) .setStreetCode(code) .setDistrictId(idStr) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertStreet(street); //Long id=1L; getCommunity(cityHref, charset, id); } } } // 社区 private void getCommunity(String url, String charset, Long idPro) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("villagetr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("td").get(0); String code = aEle.text(); Element 
aEle2 = cityElement.select("td").get(1); String cl_code = aEle2.text(); Element aEle3 = cityElement.select("td").get(2); String name = aEle3.text(); DicCommunity community = new DicCommunity() .setCommunityName(name) .setCommunityCode(code) .setClassificationCode(cl_code) .setStreetId(idPro) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertCommunity(community); } } } }
二、HttpUtil工具类
/** * TODO * * @author kevin * @createTime 2019-11-20 9:17 */ @Component public class HttpUtil { public Document get(String url, String charset) throws IOException { String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"; URL url2 = new URL(url); HttpURLConnection connection = (HttpURLConnection)url2.openConnection(); connection.setRequestMethod("GET"); //是否允许缓存,默认true。 connection.setUseCaches(Boolean.FALSE); //设置请求头信息 connection.addRequestProperty("Connection", "close"); connection.addRequestProperty("user-agent", userAgent); //设置连接主机超时(单位:毫秒) connection.setConnectTimeout(80000); //设置从主机读取数据超时(单位:毫秒) connection.setReadTimeout(80000); //开始请求 try { Document doc = Jsoup.parse(connection.getInputStream(), charset, url); return doc; } catch (Exception e) { System.out.println("parse error: " + url); } return null; } }
三、service部分,根据需要自行定义数据库表
/** * TODO * * @author kevin * @createTime 2019-11-18 20:41 */ @Service public class ProvinceServiceImpl implements ProvinceService { @Autowired private ProvinceMapper provinceMapper; @Autowired private CityMapper cityMapper; @Autowired private DistrictMapper districtMapper; @Autowired private StreetMapper streetMapper; @Autowired private CommunityMapper communityMapper; @Override public Long insertProvince(DicProvince dicProvince) { int res=0; while (res!=1){ try { res=provinceMapper.insert(dicProvince); } catch (Exception e) { res=0; System.out.println("插入省数据失败"); e.printStackTrace(); } } return dicProvince.getProvinceId(); } @Override public Long insertCity(DicCity dicCity) { int res=0; while(res!=1){ try { res=cityMapper.insert(dicCity); } catch (Exception e) { res=0; System.out.println("插入市数据失败"); e.printStackTrace(); } } return dicCity.getCityId(); } @Override public Long insertDistrict(DicDistrict dicDistrict) { int res=0; while (res!=1){ try { res=districtMapper.insert(dicDistrict); } catch (Exception e) { res=0; System.out.println("插入区县数据失败"); e.printStackTrace(); } } return dicDistrict.getDistrictId(); } @Override public Long insertStreet(DicStreet dicStreet) { int res=0; while (res!=1){ try { res=streetMapper.insert(dicStreet); } catch (Exception e) { res=0; System.out.println("插入街道数据失败"); e.printStackTrace(); } } return dicStreet.getStreetId(); } @Override public Long insertCommunity(DicCommunity dicCommunity) { int res=0; while (res!=1){ try { res=communityMapper.insert(dicCommunity); } catch (Exception e) { res=0; System.out.println("插入社区数据失败"); e.printStackTrace(); } } return dicCommunity.getCommunityId(); } }
以上是关于基于 springBoot+jsoup一 || 爬取全国行政区划数据的主要内容,如果未能解决你的问题,请参考以下文章