公司有个项目需要省、市、区县、乡镇街道、村(社区)五级行政区划数据,但没有现成的数据可用,于是写了一段代码,从国家统计局网站抓取这些数据,在此记录一下。

Java爬虫获取省市区镇村5级行政区划-LMLPHP

1. 在 Maven 中引入 jsoup 依赖,用于解析 HTML

<!-- jsoup: HTML parsing library; the crawler code below uses it to fetch pages and select elements -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

2.Java代码实现

/**
 * Crawls the 2023 division-code pages starting from the province index page and
 * walks province -> city -> county -> town -> village, inserting one
 * {@code Division} row per village through {@code divisionService.insertDivision1}.
 *
 * <p>On the city, county and town pages the {@code a[href]} links come in pairs
 * (code link followed by name link), so those lists are consumed two entries at
 * a time. Only rows that carry a link are followed; rows without one are skipped.
 *
 * <p>Child pages are located by resolving each link against the page it was found
 * on, so the crawl does not depend on the length of any file name.
 *
 * <p>An {@code IOException} from any page fetch ends the crawl; the stack trace is
 * printed and the method returns normally.
 *
 * @throws RuntimeException if the thread is interrupted while pausing between towns
 */
@GetMapping("/hh")
    public void hh(){
        // A single Division is reused for every insert; each level overwrites its own fields.
        Division d = new Division();
        // Index page listing all provinces.
        String provinceurl = "https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html";
        try {
            Document document = Jsoup.connect(provinceurl).get();
            // Province level: one link per province, the file name (minus ".html") is the code.
            Elements provincetable = document.body().select("tr.provincetr").select("a[href]");
            for (Element province : provincetable) {
                String provinceHref = province.attr("href");
                d.setProvincialCode(provinceHref.replace(".html", ""));
                d.setProvincialName(province.text());
                Document citytabledocument = Jsoup.connect(province.absUrl("href")).get();

                // City level: links alternate code, name. Step by two over the WHOLE list
                // (the previous bound of size()/2 combined with a step of two skipped half).
                Elements citytable = citytabledocument.body().select("table.citytable").select("a[href]");
                for (int i = 0; i + 1 < citytable.size(); i += 2) {
                    Element cityNameLink = citytable.get(i + 1);
                    d.setMunicipalCode(citytable.get(i).text());
                    d.setMunicipalName(cityNameLink.text());
                    String countyurl = cityNameLink.absUrl("href");
                    System.out.println("countyurl = " + countyurl);
                    Document countytableocument = Jsoup.connect(countyurl).get();

                    // County / district level.
                    Elements countytable = countytableocument.body().select("table.countytable").select("a[href]");
                    for (int j = 0; j + 1 < countytable.size(); j += 2) {
                        Element countyNameLink = countytable.get(j + 1);
                        d.setDistrictCode(countytable.get(j).text());
                        d.setDistrictName(countyNameLink.text());
                        String townturl = countyNameLink.absUrl("href");
                        Document townttableocument = Jsoup.connect(townturl).get();

                        // Town / street level.
                        Elements towntable = townttableocument.body().select("table.towntable").select("a[href]");
                        for (int k = 0; k + 1 < towntable.size(); k += 2) {
                            Element townNameLink = towntable.get(k + 1);
                            d.setStreetTownCode(towntable.get(k).text());
                            d.setStreetTownName(townNameLink.text());

                            String villageurl = townNameLink.absUrl("href");
                            System.out.println("villageurl = " + villageurl);
                            Document villagetabledocument = Jsoup.connect(villageurl).get();

                            // Village level: plain rows without links.
                            Elements villagetable = villagetabledocument.body().select("table.villagetable").select("tr.villagetr");
                            for (Element village : villagetable) {
                                // Read the row cell by cell instead of splitting the row text on
                                // spaces, so a name containing a space is not truncated.
                                // Expected cells: division code, urban-rural class code, name.
                                Elements cells = village.children();
                                if (cells.size() < 3) {
                                    // Malformed row: skip it rather than fail the whole crawl.
                                    continue;
                                }
                                String villageCode = cells.get(0).text();
                                String urbanRural = cells.get(1).text();
                                String villageName = cells.get(2).text();
                                System.out.println("统计用区划代码: " + villageCode);
                                System.out.println("城乡分类代码: " + urbanRural);
                                System.out.println("名称: " + villageName);
                                d.setCommunityVillageCode(villageCode);
                                d.setUrbanRural(urbanRural);
                                d.setCommunityVillageName(villageName);
                                System.out.println("d.toString() = " + d.toString());
                                divisionService.insertDivision1(d);
                            }
                            // Deliberate pause after each town page to slow the crawl down;
                            // adjust the delay as needed.
                            try {
                                Thread.sleep(2000);
                            } catch (InterruptedException e) {
                                // Restore the interrupt flag before propagating.
                                Thread.currentThread().interrupt();
                                throw new RuntimeException(e);
                            }

                        }

                    }
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

3.用到的实体类

import com.ruoyi.common.utils.StringUtils;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import com.ruoyi.common.annotation.Excel;
import com.ruoyi.common.core.domain.BaseEntity;

/**
 * Administrative division entity ({@code division}).
 *
 * <p>Holds the code and name of each of the five levels (province, city,
 * district/county, street/town, community/village) plus the urban-rural
 * classification of the lowest level.
 *
 * @author liphui
 * @date 2023-11-17
 */
public class Division extends BaseEntity {

    private static final long serialVersionUID = 1L;

    /** Province-level code. */
    @Excel(name = "省级代码")
    private String provincialCode;

    /** Province-level name. */
    @Excel(name = "省级名称")
    private String provincialName;

    /** City-level code. */
    @Excel(name = "市级代码")
    private String municipalCode;

    /** City-level name. */
    @Excel(name = "市级名称")
    private String municipalName;

    /** District/county code. */
    @Excel(name = "区县代码")
    private String districtCode;

    /** District/county name. */
    @Excel(name = "区县名称")
    private String districtName;

    /** Street/town/township code. */
    @Excel(name = "街镇乡代码")
    private String streetTownCode;

    /** Street/town/township name. */
    @Excel(name = "街镇乡名称")
    private String streetTownName;

    /** Community/village-level code. */
    @Excel(name = "社区村级代码")
    private String communityVillageCode;

    /** Community/village-level name. */
    @Excel(name = "社区村级名称")
    private String communityVillageName;

    /** Urban-rural classification. */
    @Excel(name = "城乡分类")
    private String urbanRural;

    /** @return the province-level code */
    public String getProvincialCode() {
        return provincialCode;
    }

    /** @param provincialCode the province-level code */
    public void setProvincialCode(String provincialCode) {
        this.provincialCode = provincialCode;
    }

    /** @return the province-level name */
    public String getProvincialName() {
        return provincialName;
    }

    /** @param provincialName the province-level name */
    public void setProvincialName(String provincialName) {
        this.provincialName = provincialName;
    }

    /** @return the city-level code */
    public String getMunicipalCode() {
        return municipalCode;
    }

    /** @param municipalCode the city-level code */
    public void setMunicipalCode(String municipalCode) {
        this.municipalCode = municipalCode;
    }

    /** @return the city-level name */
    public String getMunicipalName() {
        return municipalName;
    }

    /** @param municipalName the city-level name */
    public void setMunicipalName(String municipalName) {
        this.municipalName = municipalName;
    }

    /** @return the district/county code */
    public String getDistrictCode() {
        return districtCode;
    }

    /** @param districtCode the district/county code */
    public void setDistrictCode(String districtCode) {
        this.districtCode = districtCode;
    }

    /** @return the district/county name */
    public String getDistrictName() {
        return districtName;
    }

    /** @param districtName the district/county name */
    public void setDistrictName(String districtName) {
        this.districtName = districtName;
    }

    /** @return the street/town/township code */
    public String getStreetTownCode() {
        return streetTownCode;
    }

    /** @param streetTownCode the street/town/township code */
    public void setStreetTownCode(String streetTownCode) {
        this.streetTownCode = streetTownCode;
    }

    /** @return the street/town/township name */
    public String getStreetTownName() {
        return streetTownName;
    }

    /** @param streetTownName the street/town/township name */
    public void setStreetTownName(String streetTownName) {
        this.streetTownName = streetTownName;
    }

    /** @return the community/village-level code */
    public String getCommunityVillageCode() {
        return communityVillageCode;
    }

    /** @param communityVillageCode the community/village-level code */
    public void setCommunityVillageCode(String communityVillageCode) {
        this.communityVillageCode = communityVillageCode;
    }

    /** @return the community/village-level name */
    public String getCommunityVillageName() {
        return communityVillageName;
    }

    /** @param communityVillageName the community/village-level name */
    public void setCommunityVillageName(String communityVillageName) {
        this.communityVillageName = communityVillageName;
    }

    /** @return the urban-rural classification */
    public String getUrbanRural() {
        return urbanRural;
    }

    /** @param urbanRural the urban-rural classification */
    public void setUrbanRural(String urbanRural) {
        this.urbanRural = urbanRural;
    }

    /**
     * Builds the comma-separated chain of level names, from province down to
     * community/village, stopping at the first level whose name fails
     * {@code StringUtils.isNotEmpty}.
     *
     * @return the joined names, or an empty string when the province name is empty
     */
    public String getDivisionName() {
        return joinLeadingLevels(
                this.provincialName,
                this.municipalName,
                this.districtName,
                this.streetTownName,
                this.communityVillageName);
    }

    /**
     * Builds the comma-separated chain of level codes, from province down to
     * community/village, stopping at the first level whose code fails
     * {@code StringUtils.isNotEmpty}.
     *
     * @return the joined codes, or an empty string when the province code is empty
     */
    public String getDivisionCode() {
        return joinLeadingLevels(
                this.provincialCode,
                this.municipalCode,
                this.districtCode,
                this.streetTownCode,
                this.communityVillageCode);
    }

    /**
     * Joins the given values with "," in order, ending at (and excluding) the
     * first value for which {@code StringUtils.isNotEmpty} is false.
     *
     * @param levels values ordered from the highest level to the lowest
     * @return the joined leading values; empty when the first value is rejected
     */
    private static String joinLeadingLevels(String... levels) {
        StringBuilder joined = new StringBuilder();
        for (int index = 0; index < levels.length; index++) {
            String level = levels[index];
            if (!StringUtils.isNotEmpty(level)) {
                break;
            }
            if (index > 0) {
                joined.append(",");
            }
            joined.append(level);
        }
        return joined.toString();
    }

    /**
     * Renders every field, one per line, using
     * {@code ToStringStyle.MULTI_LINE_STYLE}.
     */
    @Override
    public String toString() {
        ToStringBuilder text = new ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE);
        text.append("provincialCode", getProvincialCode());
        text.append("provincialName", getProvincialName());
        text.append("municipalCode", getMunicipalCode());
        text.append("municipalName", getMunicipalName());
        text.append("districtCode", getDistrictCode());
        text.append("districtName", getDistrictName());
        text.append("streetTownCode", getStreetTownCode());
        text.append("streetTownName", getStreetTownName());
        text.append("communityVillageCode", getCommunityVillageCode());
        text.append("communityVillageName", getCommunityVillageName());
        text.append("urbanRural", getUrbanRural());
        return text.toString();
    }
}

其他的代码不贴了,就是数据入库。

01-06 14:53