文章目录


前言

为了示范,我将给出一个完整的Java示例,展示如何使用Jsoup库抓取中华人民共和国民政部网站上的行政区划信息,并将其解析为结构化的SQL插入语句。以下代码可以作为你实现这个功能的参考。


提示:以下是本篇文章正文内容,下面案例可供参考。

Java代码示例

步骤1:添加依赖项
如果你使用Maven,请在pom.xml文件中添加Jsoup依赖项:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>

步骤2:编写Java代码
以下是完整的Java代码,用于抓取和解析行政区划信息,并生成SQL插入语句:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.net.ssl.*;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.SecureRandom;
import java.security.cert.X509Certificate;
import java.util.regex.Pattern;

/**
 * Downloads the administrative division table from the Ministry of Civil Affairs website
 * and converts every table row that carries a six-digit region code into an
 * {@code INSERT} statement for the {@code region_code} table. The statements are written
 * to {@code region_code.sql} in the working directory.
 */
public class RegionSqlGenerator {

    /** Page that is fetched and parsed. */
    private static final String SOURCE_URL = "https://www.mca.gov.cn/mzsj/xzqh/2023/202301xzqh.html";

    /** File the generated statements are written to, relative to the working directory. */
    private static final String OUTPUT_FILE = "region_code.sql";

    /** A region code is exactly six digits; compiled once instead of once per table row. */
    private static final Pattern REGION_CODE = Pattern.compile("\\d{6}");

    /**
     * Fetches the page, builds one {@code INSERT} statement per valid row, writes them all
     * to {@link #OUTPUT_FILE} and prints the number of generated statements.
     *
     * @throws Exception if the page cannot be fetched or the output file cannot be written
     */
    public void getRegionSql() throws Exception {
        // SECURITY NOTE(review): this switches off certificate and hostname validation for
        // every HttpsURLConnection in the JVM, so the downloaded page is not authenticated.
        // Kept because the original relies on it; prefer trusting the site's certificate.
        SSLUtilities.disableSSLVerification();

        Document doc = Jsoup.connect(SOURCE_URL)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3")
                .maxBodySize(0)   // 0 = no body size limit
                .timeout(100000)  // milliseconds
                .get();

        int count = 0;
        StringBuilder sqlBuilder = new StringBuilder();

        for (Element tr : doc.select("tr")) {
            Elements tds = tr.select("td");
            // Only rows with more than three cells are considered; cell 1 is read as the
            // code and cell 2 as the name.
            if (tds.size() <= 3) {
                continue;
            }

            String regionCode = tds.get(1).text();
            if (!validCode(regionCode)) {
                continue; // header rows, spacer rows, footnotes, ...
            }

            String regionArea = tds.get(2).text();
            sqlBuilder.append(buildInsertStatement(regionCode, regionArea));
            count++;
        }

        // A write failure now propagates instead of being printed and ignored, so the
        // count below is only reported when the file was actually written.
        writeToFile(sqlBuilder.toString(), OUTPUT_FILE);
        System.out.println("总数量为:" + count);
    }

    /**
     * Builds the {@code INSERT} statement for a single region.
     *
     * <p>Level and parent are derived from the code's trailing zeros:
     * <ul>
     *   <li>{@code xx0000} - level 1, parent {@code 000000}</li>
     *   <li>{@code xxxx00} - level 2, parent {@code xx0000}</li>
     *   <li>anything else - level 3, parent {@code xxxx00}</li>
     * </ul>
     *
     * @param regionCode six-digit region code, already validated
     * @param regionName display name of the region
     * @return the statement followed by the platform line separator
     */
    private String buildInsertStatement(String regionCode, String regionName) {
        final int levelType;
        final String parentCode;

        // NOTE(review): the parent code is computed from the digits alone; whether a row
        // with that code actually appears on the page is not checked.
        if (regionCode.endsWith("0000")) {
            levelType = 1;
            parentCode = "000000";
        } else if (regionCode.endsWith("00")) {
            levelType = 2;
            parentCode = regionCode.substring(0, 2) + "0000";
        } else {
            levelType = 3;
            parentCode = regionCode.substring(0, 4) + "00";
        }

        // NOTE(review): dtime is hard-coded to '201903' although the fetched page is the
        // 2023 edition - confirm the intended value.
        return String.format("INSERT INTO region_code (code, name, level, parent_code, dtime, note, ctime) "
                        + "VALUES ('%s', '%s', %d, '%s', '201903', '系统生成', NOW());%s",
                regionCode, escapeSqlLiteral(regionName), levelType, parentCode, System.lineSeparator());
    }

    /**
     * Escapes a value for use inside a single-quoted SQL string literal by doubling
     * every single quote.
     *
     * @param value raw text scraped from the page
     * @return text that cannot terminate the surrounding literal with a single quote
     */
    private String escapeSqlLiteral(String value) {
        return value.replace("'", "''");
    }

    /**
     * Checks that a table cell holds a region code.
     *
     * @param code cell text
     * @return {@code true} if {@code code} consists of exactly six digits
     */
    private boolean validCode(String code) {
        return code != null && REGION_CODE.matcher(code).matches();
    }

    /**
     * Writes {@code content} to {@code fileName}, replacing any existing file.
     *
     * <p>The file is always encoded as UTF-8 so the Chinese region names do not depend on
     * the platform default charset.
     *
     * @param content  text to write
     * @param fileName target path
     * @throws IOException if the file cannot be created or written
     */
    private void writeToFile(String content, String fileName) throws IOException {
        try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(fileName), StandardCharsets.UTF_8)) {
            writer.write(content);
        }
    }

    /**
     * Command-line entry point: runs the generator once and prints any failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            new RegionSqlGenerator().getRegionSql();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

/**
 * Disables SSL verification.
 *
 * <p><strong>Security warning:</strong> after {@link #disableSSLVerification()} has run,
 * every {@link HttpsURLConnection} in the JVM accepts any certificate for any host name,
 * so connections are open to man-in-the-middle attacks. Do not use this against
 * endpoints whose content must be trusted.
 */
final class SSLUtilities {

    /** Static utility holder; not meant to be instantiated. */
    private SSLUtilities() {
    }

    /**
     * Installs, as JVM-wide defaults for {@link HttpsURLConnection}, a socket factory whose
     * trust manager accepts every certificate chain and a hostname verifier that accepts
     * every host name.
     *
     * <p>Best effort: if the TLS context cannot be set up, the stack trace is printed and
     * the previous defaults stay in place.
     */
    public static void disableSSLVerification() {
        try {
            TrustManager[] trustAllCertificates = new TrustManager[]{
                    new X509TrustManager() {
                        @Override
                        public X509Certificate[] getAcceptedIssuers() {
                            // The interface contract requires a non-null (possibly empty)
                            // array; returning null can trigger a NullPointerException in
                            // code that iterates over the issuers.
                            return new X509Certificate[0];
                        }

                        @Override
                        public void checkClientTrusted(X509Certificate[] certs, String authType) {
                            // Intentionally empty: every client chain is accepted.
                        }

                        @Override
                        public void checkServerTrusted(X509Certificate[] certs, String authType) {
                            // Intentionally empty: every server chain is accepted.
                        }
                    }
            };

            SSLContext sslContext = SSLContext.getInstance("TLS");
            // null key managers: no client certificate is presented.
            sslContext.init(null, trustAllCertificates, new SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(sslContext.getSocketFactory());

            HostnameVerifier allHostsValid = new HostnameVerifier() {
                @Override
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            };

            HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);
        } catch (Exception e) {
            // Deliberately non-fatal: the caller proceeds with whatever defaults are active.
            e.printStackTrace();
        }
    }
}

在这里插入图片描述

Logo

DAMO开发者矩阵,由阿里巴巴达摩院和中国互联网协会联合发起,致力于探讨最前沿的技术趋势与应用成果,搭建高质量的交流与分享平台,推动技术创新与产业应用链接,围绕“人工智能与新型计算”构建开放共享的开发者生态。

更多推荐