庞玉栋

Java两种方法实现简单爬虫

发布时间:9个月前 | 热度:497 ℃ | 评论数:

第一种:利用HttpClient和Jsoup爬取网站数据

package com.company;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;


/**
 * Minimal crawler example: downloads a page with Apache HttpClient and dumps
 * the response (status line, body, headers) to standard output.
 */
public class Main {

    /**
     * Issues a GET request to {@code http://www.pangyd.com} and prints, in order:
     * the status line, the response body decoded as UTF-8, the
     * {@code Content-Type} header(s), and finally every response header.
     *
     * @param args unused
     * @throws Exception if the request cannot be executed or the body cannot be read
     */
    public static void main (String[] args) throws Exception {
        // Build the GET request
        HttpGet httpget = new HttpGet("http://www.pangyd.com");
        System.out.println("executing request " + httpget.getURI());

        String content = "";

        // try-with-resources closes the response first and the client afterwards,
        // even when reading the body throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpget)) {

            // Status line returned by the server
            System.out.println(response.getStatusLine().toString());

            Header[] heads = response.getAllHeaders();
            HttpEntity entity = response.getEntity();

            if (entity != null) {
                // No explicit consume needed afterwards: closing the response
                // releases the entity's content stream.
                content = EntityUtils.toString(entity, "utf-8");
            }

            System.out.println(content);

            // Print each Content-Type header; printing the array itself would
            // only show its identity string, not the header values.
            for (Header contentType : response.getHeaders("Content-Type")) {
                System.out.println(contentType.getName() + ":" + contentType.getValue());
            }

            // Print all response headers
            for (Header h : heads) {
                System.out.println(h.getName() + ":" + h.getValue());
            }
        }
    }
}



第二种:直接利用Jsoup解析网址

package com.company;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.HashMap;
import java.util.Map;

/**
 * Minimal crawler example that lets jsoup fetch and parse the page directly.
 */
public class splider {

    /**
     * Builds a small demo map of the strings "0" through "10" and prints its
     * values, then fetches {@code http://blog.pangyd.com} with jsoup and prints
     * every element matched by the list-item selector.
     *
     * @throws Exception if the page cannot be fetched or parsed
     */
    public void jsoupText() throws Exception {
        Connection con = Jsoup.connect("http://blog.pangyd.com");

        // Parameterised map: with a raw Map, entrySet() yields Object elements
        // and the Map.Entry loop below does not compile.
        Map<String, String> map = new HashMap<>();
        for (int i = 0; i <= 10; i++) {
            String number = String.valueOf(i);
            map.put(number, number);
        }
        for (Map.Entry<String, String> entry : map.entrySet()) {
            System.out.println(entry.getValue());
        }

        Document doc = con.get();
        for (Element ds : doc.select("body > div:nth-child(9) > div.page_left > ul > li")) {
            System.out.println(ds);
        }
    }

    /**
     * Runs the jsoup example.
     *
     * @param args unused
     * @throws Exception propagated from {@link #jsoupText()}
     */
    public static void main(String[] args) throws Exception {
        splider splider1 = new splider();
        splider1.jsoupText();
    }
}

    /**
     * Fetches {@code http://blog.pangyd.com} with an explicit browser
     * user-agent string and prints every element matched by the list-item
     * selector to standard output.
     *
     * @throws Exception if the page cannot be fetched or parsed
     */
    public void jsoupText() throws Exception{
        final String userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3800.0 Safari/537.36";
        final String itemSelector = "body > div:nth-child(9) > div.page_left > ul > li";

        // Configure the connection first, then issue the GET.
        Document page = Jsoup.connect("http://blog.pangyd.com")
                .userAgent(userAgent)
                .get();

        for (Element item : page.select(itemSelector)) {
            System.out.println(item);
        }
    }
    /**
     * Sends a POST request with jsoup, prints the parsed response document and
     * returns it as a string.
     *
     * @param url    address to post to
     * @param map    request parameters to send as name/value pairs; may be {@code null}
     *               to send none
     * @param cookie value for the {@code Cookie} request header; when {@code null}
     *               no Cookie header is set
     * @return the response document rendered with {@code toString()}
     * @throws IOException if the request fails
     */
    public static String httpPost(String url, Map<String, String> map, String cookie) throws IOException {
        // Open the connection for the target URL
        Connection con = Jsoup.connect(url);

        // Add every request parameter. The map is typed: with a raw Map the
        // entries are Objects and cannot be passed to data(String, String).
        if (map != null) {
            for (Map.Entry<String, String> entry : map.entrySet()) {
                con.data(entry.getKey(), entry.getValue());
            }
        }

        // Send the cookie as a plain request header, but only when one was supplied
        if (cookie != null) {
            con.header("Cookie", cookie);
        }

        Document doc = con.post();
        System.out.println(doc);
        return doc.toString();
    }

简单

手机扫码访问