第一种:利用 HttpClient 和 Jsoup 爬取网站数据
package com.company; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; public class Main { public static void main (String[] args) throws Exception { // 创建默认的客户端实例 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建get请求实例 HttpGet httpget = new HttpGet("http://www.pangyd.com"); System.out.println("executing request " + httpget.getURI()); String content = ""; try { // 客户端执行get请求返回响应 CloseableHttpResponse response = httpClient.execute(httpget); // 服务器响应状态行 System.out.println(response.getStatusLine().toString()); Header[] heads = response.getAllHeaders(); HttpEntity entity = response.getEntity(); if (entity != null) { content += EntityUtils.toString(entity, "utf-8"); EntityUtils.consume(entity);// 关闭内容流 } System.out.println(content); System.out.println(response.getHeaders("Content-Type")); // 打印所有响应头 for(Header h:heads){ System.out.println(h.getName()+":"+h.getValue()); } } finally { httpClient.close(); } } }
第二种:直接利用 Jsoup 解析网址
// Approach 2: parse a page directly with Jsoup.
package com.company;

import java.util.HashMap;
import java.util.Map;

// Kept for parity with the original file, although this class does not use them:
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class splider {

    /**
     * Fetches http://blog.pangyd.com with Jsoup and prints every &lt;li&gt;
     * element matched by the hard-coded CSS selector for the page's left column.
     * Also builds and prints a small demo map of the strings "0".."10".
     *
     * @throws Exception if the HTTP request or HTML parsing fails
     */
    public void jsoupText() throws Exception {
        Connection con = Jsoup.connect("http://blog.pangyd.com");
        // The original text lost its generic parameters ("Mapmap = new HashMap ()"),
        // which does not compile; restored to Map<String, String>.
        Map<String, String> map = new HashMap<>();
        for (int i = 0; i <= 10; i++) {
            map.put(String.valueOf(i), String.valueOf(i));
        }
        for (Map.Entry<String, String> entry : map.entrySet()) {
            System.out.println(entry.getValue());
        }
        Document doc = con.get();
        for (Element ds : doc.select("body > div:nth-child(9) > div.page_left > ul > li")) {
            System.out.println(ds);
        }
    }

    /** Entry point: runs the Jsoup demo above. */
    public static void main(String[] args) throws Exception {
        splider splider1 = new splider();
        splider1.jsoupText();
    }
}
/**
 * Fetches http://blog.pangyd.com while identifying as a desktop Chrome browser,
 * then prints every &lt;li&gt; element matched by the hard-coded CSS selector
 * for the page's left column.
 *
 * @throws Exception if the HTTP request or HTML parsing fails
 */
public void jsoupText() throws Exception {
    // Present a desktop-Chrome User-Agent; some sites reject the default Jsoup agent.
    String userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3800.0 Safari/537.36";
    String listSelector = "body > div:nth-child(9) > div.page_left > ul > li";
    Document page = Jsoup.connect("http://blog.pangyd.com").userAgent(userAgent).get();
    for (Element item : page.select(listSelector)) {
        System.out.println(item);
    }
}
public static String httpPost(String url, Mapmap, String cookie) throws IOException { //获取请求连接 Connection con = Jsoup.connect(url); //遍历生成参数 if(map!=null){ for (Map.Entry entry : map.entrySet()) { //添加参数 con.data(entry.getKey(), entry.getValue()); } } //插入cookie(头文件形式) con.header("Cookie", cookie); Document doc = con.post(); System.out.println(doc); return doc.toString(); }