HttpClient爬取网页信息并解析加入数据库
·
/**
 * Crawls the paginated list served by the site's {@code dataproxy.jsp} endpoint, fetches the
 * detail page of every record that is not yet in the database, and stores the result through
 * {@link JdbcUtils}.
 *
 * <p>Crawl progress is kept in the public static fields below, so a crawl is not re-entrant.
 */
public class HttpClient {
    /** 1-based index of the first list record requested in the current batch. */
    public static int startRecord = 1;
    /** 1-based index of the last list record requested in the current batch. */
    public static int endRecord = 75;
    /** Total record count reported by the list response ({@code totalrecord}); 0 while unknown. */
    public static int maxPage = 0;
    /** Scheme and host that site-relative links are resolved against. */
    public static String headUrl = "http://www.ytgxq.gov.cn";

    /** Number of records requested per batch; equals the initial startRecord..endRecord window. */
    private static final int BATCH_SIZE = 75;

    /**
     * Walks the list in batches of {@link #BATCH_SIZE} records until every record index up to
     * the reported total has been requested.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            do {
                saveMessage();
                // The total is still unknown when the first request failed or the list is
                // empty; stop instead of looping forever.
                if (maxPage <= 0) {
                    break;
                }
                startRecord += BATCH_SIZE;
                endRecord += BATCH_SIZE;
                // Compare the batch START with the total so the last, partially filled
                // batch is still requested.
            } while (startRecord <= maxPage);
            maxPage = 0;
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the list batch {@code startRecord..endRecord}, and for each record that is not
     * yet stored downloads its detail page and inserts it into the database. On the first
     * successful call it also initialises {@link #maxPage} from the response.
     */
    public synchronized static void saveMessage() {
        try {
            String listPage = getListPageEntity(startRecord, endRecord);
            if (listPage == null) {
                return;
            }
            // Strip the CDATA markers so Jsoup parses the markup inside each record.
            Document listDocument =
                    Jsoup.parse(listPage.replace("<![CDATA[", "").replace("]]>", ""));
            if (maxPage == 0) {
                String totalrecord = listDocument.select("totalrecord").text().trim();
                if (totalrecord.isEmpty()) {
                    // Not a usable list response; leave maxPage at 0 so the caller stops.
                    return;
                }
                maxPage = Integer.parseInt(totalrecord);
            }
            for (Element element : listDocument.select("record")) {
                try {
                    saveRecord(element);
                } catch (Exception e) {
                    // One malformed record must not abort the rest of the batch.
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Stores one list record, together with its detail page, unless it is already stored. */
    private static void saveRecord(Element element) {
        Elements a = element.select("a");
        String title = a.attr("title"); // list title
        String href = a.attr("href");   // list link, also used as the record id
        String date = element.select("record span").text(); // publication time
        if (href.isEmpty()) {
            // Without a link there is neither an id nor a detail page to fetch.
            return;
        }
        if (JdbcUtils.selectById(href) != 0) {
            return; // already in the database
        }

        String detailLink = headUrl + href;
        XinXiInfoTest xin = new XinXiInfoTest();
        xin.setId(href);
        xin.setSourceName("xxx");
        xin.setDetailLink(detailLink);
        xin.setListTitle(title);
        xin.setPageTime(date);
        xin.setCreateTime(LocalDateTime.now());
        xin.setCreateBy("xxx");

        String detailPage = getDetailPageEntity(detailLink);
        if (detailPage == null) {
            // As before, a record whose detail page cannot be fetched is not saved.
            return;
        }
        fillDetail(xin, detailPage, detailLink);
        JdbcUtils.save(xin);
    }

    /** Copies the title and the link-corrected {@code .content p} markup into {@code xin}. */
    private static void fillDetail(XinXiInfoTest xin, String detailPage, String detailLink) {
        // Passing the page URL as base URI lets Jsoup resolve relative links.
        Document detailDocument = Jsoup.parse(detailPage, detailLink);
        Element titleElement = detailDocument.select("title").first();
        String detailTitle = titleElement == null ? "" : titleElement.text();

        Elements detailContentElements = detailDocument.select(".content p");
        absolutize(detailContentElements.select("a[href]"), "href");
        absolutize(detailContentElements.select("img[src]"), "src");

        xin.setDetailTitle(detailTitle);
        xin.setDetailContent(detailContentElements.outerHtml());
    }

    /** Rewrites {@code attribute} of every node to its own absolute URL, one node at a time. */
    private static void absolutize(Elements nodes, String attribute) {
        for (Element node : nodes) {
            String absolute = node.absUrl(attribute);
            // absUrl yields "" when the value cannot be resolved; keep the original then.
            if (!absolute.isEmpty()) {
                node.attr(attribute, absolute);
            }
        }
    }

    /**
     * Requests one batch of the list from the {@code dataproxy.jsp} endpoint.
     *
     * @param startRecord index of the first record to request
     * @param endReocrd index of the last record to request
     * @return the response body decoded as UTF-8, or {@code null} when the request fails,
     *     the status is not 200, or the response has no body
     */
    public static String getListPageEntity(int startRecord, int endReocrd ) {
        String map = "col=1&webid=151&path=%2F&columnid=57686&sourceContentType=1&unitid=204748"
                + "&webname=%25E7%2583%259F%25E5%258F%25B0%25E9%25AB%2598%25E6%2596%25B0%25E6%258A%2580%25E6%259C%25AF%25E4%25BA%25A7%25E4%25B8%259A%25E5%25BC%2580%25E5%258F%2591%25E5%258C%25BA&permissiontype=0";
        HttpUriRequest post = RequestBuilder.post(headUrl
                        + "/module/web/jpage/dataproxy.jsp?startrecord=" + startRecord
                        + "&endrecord=" + endReocrd + "&perpage=25")
                .setEntity(new StringEntity(map, ContentType.APPLICATION_FORM_URLENCODED))
                .build();
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(post)) {
            // Same rule as for detail pages: only a 200 response is treated as content.
            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            HttpEntity entity = response.getEntity();
            return entity == null ? null : EntityUtils.toString(entity, "utf-8");
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Downloads a detail page.
     *
     * @param url absolute URL of the detail page
     * @return the page body decoded as UTF-8, or {@code null} when the request fails,
     *     the status is not 200, or the response has no body
     */
    public static String getDetailPageEntity(String url){
        HttpGet httpGet = new HttpGet(url);
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse closeableHttpResponse = httpClient.execute(httpGet)) {
            if (closeableHttpResponse.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            HttpEntity entity = closeableHttpResponse.getEntity();
            return entity == null ? null : EntityUtils.toString(entity, "utf-8");
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }
}
JDBC工具类
/**
 * Minimal JDBC helper for the {@code XIN_XI_INFO_TEST} table. Every call opens its own
 * connection and releases the result set, statement and connection, in that order, before
 * returning.
 */
public class JdbcUtils {
    // Set these for whichever database is in use.
    private static final String driver = "oracle.jdbc.driver.OracleDriver";
    private static final String url = "jdbc:oracle:thin:@192.168.2.42:1521:orcl";
    private static final String user = "xxx";
    private static final String password = "xxx";

    /** Loads the configured driver class and opens a new connection. */
    private static Connection openConnection() throws ClassNotFoundException, SQLException {
        Class.forName(driver);
        return DriverManager.getConnection(url, user, password);
    }

    /**
     * Inserts one record. Failures are printed to stderr and otherwise ignored.
     *
     * @param xit the record to insert
     */
    public static void save(XinXiInfoTest xit){
        String sql = "insert into XIN_XI_INFO_TEST (ID, SOURCE_NAME, DETAIL_LINK, DETAIL_TITLE, DETAIL_CONTENT, PAGE_TIME, CREATE_TIME, LIST_TITLE, CREATE_BY) VALUES "
                +"(?,?,?,?,?,?,?,?,?)";
        // Resources are closed in reverse declaration order: statement first, then connection.
        try (Connection con = openConnection();
             PreparedStatement ps = con.prepareStatement(sql)) {
            ps.setObject(1, xit.getId());
            ps.setObject(2, xit.getSourceName());
            ps.setObject(3, xit.getDetailLink());
            ps.setObject(4, xit.getDetailTitle());
            ps.setObject(5, xit.getDetailContent());
            ps.setObject(6, xit.getPageTime());
            ps.setObject(7, LocalDateTime.now());
            ps.setObject(8, xit.getListTitle());
            ps.setObject(9, xit.getCreateBy());
            ps.execute();
            System.out.println("添加成功");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Counts the rows whose {@code ID} equals the given id.
     *
     * @param id the id to look up
     * @return the row count; 0 is also returned when the query fails (the error is printed
     *     to stderr), so a return of 0 does not prove the row is absent
     */
    public static int selectById(String id){
        String sql = "select count(*) from XIN_XI_INFO_TEST where ID = ?";
        try (Connection con = openConnection();
             PreparedStatement ps = con.prepareStatement(sql)) {
            ps.setObject(1, id);
            try (ResultSet rs = ps.executeQuery()) {
                if (rs.next()) {
                    return rs.getInt(1);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0;
    }
}
DAMO开发者矩阵,由阿里巴巴达摩院和中国互联网协会联合发起,致力于探讨最前沿的技术趋势与应用成果,搭建高质量的交流与分享平台,推动技术创新与产业应用链接,围绕“人工智能与新型计算”构建开放共享的开发者生态。
更多推荐


所有评论(0)