/**
 * Crawler that walks the article list of the site at {@link #headUrl}, downloads each
 * article's detail page and stores list/detail data through {@code JdbcUtils}.
 *
 * <p>The list is requested in windows of {@code WINDOW_SIZE} records
 * ({@link #startRecord}..{@link #endRecord}). The first parsed response supplies the total
 * record count, which bounds the crawl.
 *
 * <p>State is kept in public mutable static fields, so only one crawl should run per JVM
 * at a time.
 */
public class HttpClient {

    /** 1-based index of the first record of the window requested next. */
    public static int startRecord = 1;
    /** 1-based index of the last record of the window requested next. */
    public static int endRecord = 75;
    /**
     * Total number of records reported by the list endpoint ({@code totalrecord});
     * {@code 0} means "not known yet". Despite the name this is a record count, not a page count.
     */
    public static  int maxPage = 0;
    /** Scheme and host prepended to site-relative links. */
    public static String headUrl = "http://www.ytgxq.gov.cn";

    /** Number of records covered by one list request (matches the initial 1..75 window). */
    private static final int WINDOW_SIZE = 75;


    /**
     * Crawls every list window until the reported total has been covered.
     *
     * <p>The loop continues while the window's <em>start</em> is within the total, so the
     * final, partially filled window is fetched as well. If the total is still unknown (or
     * zero) after a window was processed the crawl stops instead of spinning forever.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        try {

            while (maxPage == 0 || startRecord <= maxPage){

                saveMessage();

                if (maxPage <= 0){
                    // Request failed, total was unparsable, or the list is empty:
                    // there is no bound to iterate against, so give up.
                    System.err.println("Total record count unavailable; stopping crawl.");
                    break;
                }

                startRecord += WINDOW_SIZE;
                endRecord += WINDOW_SIZE;
            }

            maxPage = 0;

        }catch (Exception e){
            e.printStackTrace();
        }
    }

    /**
     * Fetches the list window {@link #startRecord}..{@link #endRecord}, and for every record
     * that is not yet in the database downloads its detail page and saves it.
     *
     * <p>On the first successful call {@link #maxPage} is initialised from the response's
     * {@code totalrecord} element. Failures are printed and swallowed; a failure while
     * handling one record does not prevent the remaining records from being processed.
     */
    public synchronized static void saveMessage(){
        try {

            // Raw body of the list endpoint for the current window.
            String listPage = getListPageEntity(startRecord, endRecord);
            if (listPage == null){
                return;
            }

            // Strip the CDATA markers so the markup inside them is parsed as elements.
            Document listDocument = Jsoup.parse(listPage.replace("<![CDATA[","").replace("]]>",""));
            if (maxPage == 0){
                String totalrecord = listDocument.select("totalrecord").text();
                maxPage = Integer.parseInt(totalrecord.trim());
            }

            for (Element item : listDocument.select("record")) {
                try {
                    saveRecord(item);
                } catch (Exception e) {
                    // Keep going: one malformed record must not drop the rest of the window.
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Stores one list record together with its detail page, unless it is already stored
     * or its detail page cannot be fetched.
     *
     * @param item a {@code record} element of the list response
     */
    private static void saveRecord(Element item){
        Elements anchor = item.select("a");
        String title = anchor.attr("title"); // list title
        String href = anchor.attr("href"); // link of the list entry, also used as the row id
        String date = item.select("record span").text(); // publication time

        if (href.isEmpty()){
            // No link means no id and no detail page to fetch.
            return;
        }

        // Skip entries that are already in the database.
        if (JdbcUtils.selectById(href) != 0){
            return;
        }

        String detailLink = toAbsoluteUrl(href);

        XinXiInfoTest xin = new XinXiInfoTest();
        xin.setId(href);
        xin.setSourceName("xxx");
        xin.setDetailLink(detailLink);
        xin.setListTitle(title);
        xin.setPageTime(date);
        xin.setCreateTime(LocalDateTime.now());
        xin.setCreateBy("xxx");

        // Raw HTML of the detail page; nothing is saved when it cannot be fetched.
        String detailPage = getDetailPageEntity(detailLink);
        if (detailPage == null){
            return;
        }

        // Parsing with a base URI lets absUrl() resolve relative links below.
        Document detailDocument = Jsoup.parse(detailPage, detailLink);
        Elements detailTitles = detailDocument.select("title");
        String detailTitle = detailTitles.isEmpty() ? "" : detailTitles.get(0).text();

        // Complete attachment and image links, each element with its own URL.
        Elements detailContentElements = detailDocument.select(".content p");
        absolutize(detailContentElements.select("a[href]"), "href");
        absolutize(detailContentElements.select("img[src]"), "src");

        xin.setDetailTitle(detailTitle); // detail page title
        xin.setDetailContent(detailContentElements.outerHtml()); // detail page content

        JdbcUtils.save(xin); // persist
    }

    /**
     * Returns {@code href} unchanged when it already carries an http(s) scheme, otherwise
     * prefixes it with {@link #headUrl}.
     */
    private static String toAbsoluteUrl(String href){
        if (href.startsWith("http://") || href.startsWith("https://")){
            return href;
        }
        return headUrl + href;
    }

    /**
     * Rewrites {@code attribute} of every given element to its absolute form. Values that
     * cannot be resolved against the document's base URI are left untouched.
     */
    private static void absolutize(Elements nodes, String attribute){
        for (Element node : nodes) {
            String absolute = node.absUrl(attribute);
            if (!absolute.isEmpty()){
                node.attr(attribute, absolute);
            }
        }
    }


    /**
     * Requests one window of the article list.
     *
     * @param startRecord 1-based index of the first record to return
     * @param endReocrd   1-based index of the last record to return
     * @return the response body decoded as UTF-8, or {@code null} when the request fails,
     *         the status is not 200 or the response has no body
     */
    public static String getListPageEntity(int startRecord, int endReocrd )  {
        // Form body expected by the list endpoint (values copied verbatim from the site).
        String map="col=1&webid=151&path=%2F&columnid=57686&sourceContentType=1&unitid=204748" +
                "&webname=%25E7%2583%259F%25E5%258F%25B0%25E9%25AB%2598%25E6%2596%25B0%25E6%258A%2580%25E6%259C%25AF%25E4%25BA%25A7%25E4%25B8%259A%25E5%25BC%2580%25E5%258F%2591%25E5%258C%25BA&permissiontype=0";

        HttpUriRequest post = RequestBuilder.post(headUrl +
                "/module/web/jpage/dataproxy.jsp?startrecord="+startRecord+"&endrecord="+endReocrd+"&perpage=25")
                .setEntity(new StringEntity(map, ContentType.APPLICATION_FORM_URLENCODED))
                .build();

        return fetchBody(post);
    }


    /**
     * Downloads a detail page.
     *
     * @param url absolute URL of the detail page
     * @return the response body decoded as UTF-8, or {@code null} when the request fails,
     *         the status is not 200 or the response has no body
     */
    public static String getDetailPageEntity(String url){
        return fetchBody(new HttpGet(url));
    }

    /**
     * Executes {@code request} with a fresh client and returns the body of a 200 response.
     * Client and response are always closed; I/O errors are printed and yield {@code null}
     * unless the body had already been read completely.
     */
    private static String fetchBody(HttpUriRequest request){
        String body = null;
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(request)) {

            HttpEntity entity = response.getEntity();
            // Only a 200 response with a body is treated as a usable page.
            if (response.getStatusLine().getStatusCode() == 200 && entity != null){
                body = EntityUtils.toString(entity,"utf-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return body;
    }


}

Jdbc工具类

/**
 * Minimal JDBC helper for the {@code XIN_XI_INFO_TEST} table.
 *
 * <p>Every call opens its own connection and closes it (together with the statement and
 * result set) before returning. Errors are printed and swallowed.
 */
public class JdbcUtils {

    // Connection settings: adjust these to the database actually in use.
    // NOTE(review): credentials are hard-coded here; consider loading them from configuration.
    private static final String driver = "oracle.jdbc.driver.OracleDriver";
    private static final String url = "jdbc:oracle:thin:@192.168.2.42:1521:orcl";
    private static final String user = "xxx";
    private static final String password = "xxx";

    /**
     * Loads the configured driver class and opens a new connection.
     *
     * @throws ClassNotFoundException if the driver class is not on the classpath
     * @throws SQLException           if the connection cannot be established
     */
    private static Connection openConnection() throws ClassNotFoundException, SQLException {
        Class.forName(driver);
        return DriverManager.getConnection(url, user, password);
    }

    /**
     * Inserts one row built from {@code xit} into {@code XIN_XI_INFO_TEST}.
     *
     * <p>{@code CREATE_TIME} is set to the current time at insertion. Failures are printed
     * and not propagated.
     *
     * @param xit the record to insert
     */
    public static void save(XinXiInfoTest xit){

        String sql = "insert into XIN_XI_INFO_TEST (ID, SOURCE_NAME, DETAIL_LINK, DETAIL_TITLE, DETAIL_CONTENT, PAGE_TIME, CREATE_TIME, LIST_TITLE, CREATE_BY) VALUES "
                +"(?,?,?,?,?,?,?,?,?)";

        // Resources are closed in reverse order: statement first, then connection.
        try (Connection con = openConnection();
             PreparedStatement ps = con.prepareStatement(sql)) {

            ps.setObject(1,xit.getId());
            ps.setObject(2,xit.getSourceName());
            ps.setObject(3,xit.getDetailLink());
            ps.setObject(4,xit.getDetailTitle());
            ps.setObject(5,xit.getDetailContent());
            ps.setObject(6,xit.getPageTime());
            ps.setObject(7, LocalDateTime.now());
            ps.setObject(8,xit.getListTitle());
            ps.setObject(9,xit.getCreateBy());

            ps.execute();

            System.out.println("添加成功");

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Counts the rows of {@code XIN_XI_INFO_TEST} whose {@code ID} equals {@code id}.
     *
     * @param id the id to look up
     * @return the number of matching rows; {@code 0} is also returned when the query fails
     *         (the error is printed), so {@code 0} does not prove absence
     */
    public static int selectById(String id){

        String sql = "select count(*) from XIN_XI_INFO_TEST where ID = ?";
        int count = 0;

        try (Connection con = openConnection();
             PreparedStatement ps = con.prepareStatement(sql)) {

            ps.setObject(1,id);

            try (ResultSet rs = ps.executeQuery()) {
                if (rs.next()){
                    count = rs.getInt(1);
                }
            }

        } catch (Exception e) {
            e.printStackTrace();
        }

        return count;

    }


}

Logo

DAMO开发者矩阵,由阿里巴巴达摩院和中国互联网协会联合发起,致力于探讨最前沿的技术趋势与应用成果,搭建高质量的交流与分享平台,推动技术创新与产业应用链接,围绕“人工智能与新型计算”构建开放共享的开发者生态。

更多推荐