
package com.zb.crawler.httpclient;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
public class Main {
/**
 * Shows six different ways of obtaining an {@link HttpClient} instance.
 * The instances are only created here; no request is sent.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // 1) Direct construction. This approach is deprecated and not recommended.
    final HttpClient legacyClient = new DefaultHttpClient();

    // 2) Builder obtained through the HttpClients factory.
    final HttpClient customClient = HttpClients.custom().build();

    // 3) Builder obtained through HttpClientBuilder directly.
    final HttpClient builderClient = HttpClientBuilder.create().build();

    // 4) - 6) Ready-made factory methods on HttpClients.
    final HttpClient defaultClient = HttpClients.createDefault();
    final HttpClient systemClient = HttpClients.createSystem();
    final HttpClient minimalClient = HttpClients.createMinimal();
}
}package com.zb.book.httpclient;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
public class SetHeaderOne {
/**
 * Sends a GET request to Baidu with browser-like headers set one by one on the
 * request object, and prints the response body when the status code is 200.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the request fails or the response body cannot be read
 */
public static void main(String[] args) throws IOException {
    // Initialise the client.
    HttpClient httpClient = HttpClients.custom().build();
    // Create the GET request.
    HttpGet httpGet = new HttpGet("http://www.baidu.com/");
    // Configure the request headers.
    // Media ranges in Accept are comma-separated ("image/apng,*/*", not "image/apng.*/*").
    httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    httpGet.setHeader("Accept-Encoding", "gzip,deflate");
    httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
    httpGet.setHeader("Cache-Control", "max-age=0");
    // Host carries only the host name (plus optional port), never a full URL.
    httpGet.setHeader("Host", "www.baidu.com");
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36");
    // Send the GET request.
    HttpResponse response = httpClient.execute(httpGet);
    try {
        // Read the response status code.
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode == 200) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Convert the body to a string; "UTF-8" is passed as the charset.
                String content = EntityUtils.toString(entity, "UTF-8");
                System.out.println(content);
            }
        }
    } finally {
        // Consume the entity on every path (not just on 200) so the underlying
        // connection is released even for error responses.
        EntityUtils.consume(response.getEntity());
    }
}
}package com.zb.book.httpclient;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class SetHeaderList {
/**
 * Sends a GET request to Baidu using a client whose default headers are
 * supplied as a list, and prints the response body when the status code is 200.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the request fails or the response body cannot be read
 */
public static void main(String[] args) throws IOException {
    // Collect the header information in a list.
    List<Header> headerList = new ArrayList<>();
    // Media ranges in Accept are comma-separated ("image/apng,*/*", not "image/apng.*/*").
    headerList.add(new BasicHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"));
    headerList.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, "gzip,deflate"));
    headerList.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.9"));
    headerList.add(new BasicHeader(HttpHeaders.CACHE_CONTROL, "max-age=0"));
    // Host carries only the host name (plus optional port), never a full URL.
    headerList.add(new BasicHeader(HttpHeaders.HOST, "www.baidu.com"));
    headerList.add(new BasicHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"));
    // Initialise the client with the headers as defaults for every request.
    HttpClient httpClient = HttpClients.custom().setDefaultHeaders(headerList).build();
    // Create the GET request.
    HttpGet httpGet = new HttpGet("http://www.baidu.com/");
    // Send the GET request.
    HttpResponse response = httpClient.execute(httpGet);
    try {
        // Read the response status code.
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode == 200) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Convert the body to a string; "UTF-8" is passed as the charset.
                String content = EntityUtils.toString(entity, "UTF-8");
                System.out.println(content);
            }
        }
    } finally {
        // Consume the entity on every path (not just on 200) so the underlying
        // connection is released even for error responses.
        EntityUtils.consume(response.getEntity());
    }
}
}非常简单,可参考前两种方式实现;
package com.zb.book.httpclient;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
//Post提交表单
public class PostSubmitForm {
/**
 * Prepares a form-style POST request: two name/value pairs are URL-encoded
 * into an entity that is attached to an {@link HttpPost}. The request is only
 * built here; executing it with a client would submit the form.
 *
 * @param args command-line arguments (unused)
 * @throws UnsupportedEncodingException if the "UTF-8" encoding name is rejected
 */
public static void main(String[] args) throws UnsupportedEncodingException {
    // Parameters to submit, kept in a list.
    final NameValuePair first = new BasicNameValuePair("param1", "value1");
    final NameValuePair second = new BasicNameValuePair("param2", "value2");
    final List<NameValuePair> formParams = new ArrayList<>();
    formParams.add(first);
    formParams.add(second);

    // Wrap the parameters in a URL-encoded form entity.
    final UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(formParams, "UTF-8");

    // Build the POST request and attach the form entity to it.
    final HttpPost postRequest = new HttpPost("http://xxx.xxx.xxx");
    postRequest.setEntity(formEntity);
    // Executing this request submits the form.
}
}使用HttpClient可设置三种超时时间:ConnectionRequestTimeout(从连接池获取连接的超时时间)、ConnectTimeout(建立连接超时时间)、SocketTimeout(获取数据超时时间)。配置这三种超时时间,需要用到HttpClient的RequestConfig类中的方法custom(),该方法返回值为实例化的内部类Builder(配置器),其功能是配置相关请求的字段,还可以设置代理(proxy)、Cookie规范(cookieSpec)、是否允许HTTP相关认证等;
package com.zb.book.httpclient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
//设置超时时间
public class SetTimeout {
/**
 * Builds a client whose default request configuration sets all three
 * timeouts (socket, connect, connection-request) to 10 seconds.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // One shared value so the three timeouts cannot drift apart
    // (the connection-request timeout was previously 1000 ms, i.e. 1 second).
    final int timeoutMillis = 10 * 1000;
    // Create the RequestConfig; every timeout is 10 seconds.
    RequestConfig requestConfig = RequestConfig.custom()
            .setSocketTimeout(timeoutMillis)            // time allowed for receiving data
            .setConnectTimeout(timeoutMillis)           // time allowed for establishing the connection
            .setConnectionRequestTimeout(timeoutMillis) // time allowed for obtaining a connection
            .build();
    // Apply the configuration to the client as its default.
    CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
    // Requests can now be issued and handled as usual.
    // Alternatively the configuration can be applied per request, e.g.
    // httpGet.setConfig(requestConfig), adapting the rest of the code accordingly.
}
}package com.zb.book.httpclient;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
//设置代理服务器
public class SetProxy {
/**
 * Builds a client whose default request configuration routes requests
 * through a proxy server.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Proxy endpoint; the scheme argument is left null.
    final HttpHost proxyHost = new HttpHost("171.221.239.11", 808, null);

    // Request configuration that carries the proxy setting.
    final RequestConfig.Builder configBuilder = RequestConfig.custom();
    configBuilder.setProxy(proxyHost);
    final RequestConfig proxyConfig = configBuilder.build();

    // Apply the configuration to the client as its default.
    final CloseableHttpClient proxiedClient = HttpClients.custom()
            .setDefaultRequestConfig(proxyConfig)
            .build();
    // Requests can now be issued and handled as usual.
    // Alternatively the configuration can be applied per request, e.g.
    // httpGet.setConfig(requestConfig), adapting the rest of the code accordingly.
}
}下载HTML、图片、PDF和压缩包等文件时,一种方法是使用HttpEntity类将响应实体转化为字节数组,再利用输出流的方式写入指定文件。另一种方法是使用HttpEntity类中的writeTo(OutputStream)方法,直接将响应实体写入指定的输出流中,这种方法简单且常用,代码演示如下。
package com.zb.book.httpclient;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.FileOutputStream;
import java.io.IOException;
//下载文件
public class DownloadFile {
/**
 * Downloads a single image and stores it in a local file by writing the
 * response entity straight to a {@link FileOutputStream}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the request fails, the server does not answer with
 *         status 200, the response has no body, or the file cannot be written
 */
public static void main(String[] args) throws IOException {
    // Create the GET request.
    HttpGet httpGet = new HttpGet("https://www.baidu.com/img/PCtm_d9c8750bed0b3c7d089fa7d55720d6cf.png");
    // try-with-resources closes the response and the client on every path.
    try (CloseableHttpClient httpClient = HttpClients.custom().build();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        HttpEntity httpEntity = response.getEntity();
        try {
            // Do not store an error page under the image's file name.
            if (response.getStatusLine().getStatusCode() != 200) {
                throw new IOException("Unexpected response: " + response.getStatusLine());
            }
            if (httpEntity == null) {
                throw new IOException("Response contained no body: " + response.getStatusLine());
            }
            // Write the body out; the stream is closed even if writeTo fails.
            try (FileOutputStream fileOut = new FileOutputStream("C:\\Users\\ZiBo\\Desktop\\1.png")) {
                httpEntity.writeTo(fileOut);
            }
        } finally {
            // Consume the entity (null-safe).
            EntityUtils.consume(httpEntity);
        }
    }
}
}与jsoup类似,具体做法见代码演示;
package com.httpclient.ssl;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.Arrays;
import javax.net.ssl.SSLContext;
import javax.net.ssl.X509TrustManager;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
public class SSLClient {
/**
 * Builds an {@link HttpClient} whose HTTPS connections use an
 * {@link SSLContext} created for the given protocol.
 * <p>
 * WARNING: the context is initialised with a trust manager that accepts any
 * certificate, and the socket factory uses {@link NoopHostnameVerifier}, so
 * the server's identity is NOT verified. Use only where that is acceptable.
 *
 * @param SSLProtocolVersion protocol name handed to
 *        {@link SSLContext#getInstance(String)}, e.g. SSL, SSLv3, TLS, TLSv1,
 *        TLSv1.1, TLSv1.2
 * @return the configured client
 * @throws IllegalArgumentException if the protocol name is not supported
 * @throws IllegalStateException if the SSL context cannot be initialised
 */
public HttpClient initSSLClient(String SSLProtocolVersion){
    SSLContext context;
    try {
        // Create the SSLContext and initialise it with the trust manager.
        X509TrustManager xtm = new SSL509TrustManager();
        context = SSLContext.getInstance(SSLProtocolVersion);
        context.init(null, new X509TrustManager[]{xtm}, null);
    } catch (NoSuchAlgorithmException e) {
        // Fail loudly: continuing would silently hand back a client that does
        // not use the SSL configuration the caller asked for.
        throw new IllegalArgumentException("Unsupported SSL/TLS protocol: " + SSLProtocolVersion, e);
    } catch (KeyManagementException e) {
        throw new IllegalStateException("Could not initialise SSLContext for " + SSLProtocolVersion, e);
    }
    // Socket factory for HTTPS derived from the SSLContext; host name
    // verification is disabled through NoopHostnameVerifier.
    SSLConnectionSocketFactory sslConnectionSocketFactory =
            new SSLConnectionSocketFactory(context, NoopHostnameVerifier.INSTANCE);
    Registry<ConnectionSocketFactory> sfr = RegistryBuilder.<ConnectionSocketFactory>create()
            .register("http", PlainConnectionSocketFactory.INSTANCE)
            .register("https", sslConnectionSocketFactory)
            .build();
    // Connection pool built on top of the registry.
    PoolingHttpClientConnectionManager pcm = new PoolingHttpClientConnectionManager(sfr);
    // Global request configuration: cookie spec, HTTP authentication schemes, timeouts.
    RequestConfig defaultConfig = RequestConfig.custom()
            .setCookieSpec(CookieSpecs.STANDARD_STRICT)
            .setExpectContinueEnabled(true)
            .setTargetPreferredAuthSchemes(Arrays.asList(AuthSchemes.NTLM, AuthSchemes.DIGEST))
            .setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC))
            .setConnectionRequestTimeout(30*1000)
            .setConnectTimeout(30*1000)
            .setSocketTimeout(30*1000)
            .build();
    // Assemble the client.
    return HttpClients.custom()
            .setConnectionManager(pcm)
            .setDefaultRequestConfig(defaultConfig)
            .build();
}
/**
 * {@link X509TrustManager} implementation whose checks are empty, so every
 * client and server certificate chain is accepted.
 * NOTE(review): this disables certificate validation entirely.
 */
private static class SSL509TrustManager implements X509TrustManager {

    /** Client certificate check: intentionally empty, any client certificate is accepted. */
    @Override
    public void checkClientTrusted(X509Certificate[] chain, String authType) {
        // no-op
    }

    /** Server certificate check: intentionally empty, any server certificate is accepted. */
    @Override
    public void checkServerTrusted(X509Certificate[] chain, String authType) {
        // no-op
    }

    /** Returns the accepted issuers, which is always an empty array here. */
    @Override
    public X509Certificate[] getAcceptedIssuers() {
        final X509Certificate[] none = new X509Certificate[0];
        return none;
    }
}
}package com.httpclient.ssl;
import java.io.IOException;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;
public class Test {
/**
 * Fetches one HTTPS page through the client produced by
 * {@code SSLClient.initSSLClient("TLS")} and prints the body when the
 * response status is 200.
 *
 * @param args command-line arguments (unused)
 * @throws ParseException declared for {@link EntityUtils#toString}
 * @throws IOException if the request fails or the body cannot be read
 */
public static void main(String[] args) throws ParseException, IOException {
    String url = "https://cn.kompass.com/a/hospitality-tourism-hotel-and-catering-industries/78/";
    SSLClient sslClient = new SSLClient();
    HttpClient httpClientSSL = sslClient.initSSLClient("TLS");
    HttpGet httpGet = new HttpGet(url);
    // Let a failed request propagate: catching it here and carrying on would
    // only lead to a NullPointerException on the missing response below.
    HttpResponse httpResponse = httpClientSSL.execute(httpGet);
    try {
        // Status code 200 means the request succeeded.
        if (httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
            // Read the entity content as a string and print it.
            String entity = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
            System.out.println(entity);
        }
    } finally {
        // Consume the entity on every path (success, other status, read failure).
        EntityUtils.consume(httpResponse.getEntity());
    }
}
}使用HttpClient请求URL时,有时会出现请求异常的情况。针对一些非致命的异常,可以通过请求重试解决。HttpClient提供了默认重试策略DefaultHttpRequestRetryHandler。DefaultHttpRequestRetryHandler类实现了HttpRequestRetryHandler接口,重写了retryRequest()方法。由源码可以发现DefaultHttpRequestRetryHandler类定义的默认重试次数为3次;幂等方法(如GET和HEAD是幂等的)可以重试:如果网页请求失败,可以重试。另外,针对4种异常不进行重试,这四种异常分别是InterruptedIOException (线程中断异常)、UnknownHostException (未知的Host异常)、ConnectException (连接异常,如连接拒绝异常)和SSLException ( HTTPS请求认证异常)。
package com.zb.book.httpclient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
//设置请求重试
public class SetRequestRetry {
/**
 * Shows two ways of configuring request retries on a client builder.
 * The built clients are not used further.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Variant 1: the default handler (retries 3 times by default).
    final DefaultHttpRequestRetryHandler defaultRetry = new DefaultHttpRequestRetryHandler();
    HttpClients.custom().setRetryHandler(defaultRetry).build();

    // Variant 2: a custom retry count of 5.
    final DefaultHttpRequestRetryHandler fiveRetries = new DefaultHttpRequestRetryHandler(5, true);
    HttpClients.custom().setRetryHandler(fiveRetries).build();
}
}值得注意的是,在进行数据爬取时经常遇到的两种超时时间: ConnectTimeout(建立连接的超时时间)和SocketTimeout(获取数据的超时时间),这两种超时时间对应的异常( ConnectTimeoutException与SocketTimeoutException )都继承自InterruptedIOException类,即属于线程中断异常,不会进行重试。
(可参考原始学习笔记的连接池)
package com.httpclient.thread;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.Consts;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.ConnectionConfig;
import org.apache.http.config.SocketConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
public class Test {
/**
 * Downloads several pages concurrently: a pooled client is shared by a
 * fixed-size thread pool, and each URL is written to its own file under
 * the {@code file/} directory.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException if the output directory cannot be created
 *         or an output file cannot be opened
 */
public static void main(String[] args) throws FileNotFoundException {
    // FileOutputStream does not create missing parent directories, so make
    // sure the output directory exists before anything else is set up.
    java.io.File outputDir = new java.io.File("file");
    if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
        throw new FileNotFoundException("Cannot create output directory: " + outputDir.getAbsolutePath());
    }
    // Connection parameters.
    ConnectionConfig connectionConfig = ConnectionConfig.custom()
            .setMalformedInputAction(CodingErrorAction.IGNORE)
            .setUnmappableInputAction(CodingErrorAction.IGNORE)
            .setCharset(Consts.UTF_8)
            .build();
    // Socket parameters.
    SocketConfig socketConfig = SocketConfig.custom()
            .setTcpNoDelay(true)
            .build();
    // Connection pool manager.
    PoolingHttpClientConnectionManager pcm = new PoolingHttpClientConnectionManager();
    // Maximum number of connections in the pool.
    pcm.setMaxTotal(100);
    // Default maximum number of connections per route.
    pcm.setDefaultMaxPerRoute(10);
    pcm.setDefaultConnectionConfig(connectionConfig);
    pcm.setDefaultSocketConfig(socketConfig);
    // Global request configuration: cookie spec, HTTP authentication schemes, timeouts.
    RequestConfig defaultConfig = RequestConfig.custom()
            .setCookieSpec(CookieSpecs.STANDARD_STRICT)
            .setExpectContinueEnabled(true)
            .setTargetPreferredAuthSchemes(Arrays
                    .asList(AuthSchemes.NTLM, AuthSchemes.DIGEST))
            .setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC))
            .setConnectionRequestTimeout(30*1000)
            .setConnectTimeout(30*1000)
            .setSocketTimeout(30*1000)
            .build();
    CloseableHttpClient httpClient = HttpClients.custom()
            .setConnectionManager(pcm)
            .setDefaultRequestConfig(defaultConfig)
            .build();
    // URLs to request.
    String[] urlArr = {
            "https://hbr.org/podcasts",
            "https://hbr.org/magazine",
            "https://hbr.org/most-popular",
            "https://hbr.org/big-ideas",
            "https://hbr.org/reading-lists"
    };
    // Fixed-size thread pool.
    ExecutorService exec = Executors.newFixedThreadPool(3);
    try {
        for (String url : urlArr) {
            // The part after "org/" is used as the output file name.
            String filename = url.split("org/")[1];
            OutputStream out = new FileOutputStream("file/" + filename);
            HttpGet httpget = new HttpGet(url);
            // Hand the download to the pool; the task closes the stream.
            exec.execute(new DownHtmlFileThread(httpClient, httpget, out));
        }
    } finally {
        // Always shut the pool down, even if opening a file failed mid-loop;
        // otherwise its non-daemon worker threads keep the JVM alive.
        // Already submitted tasks still run to completion.
        exec.shutdown();
    }
}
/**
 * Task that requests one URL with the shared client and writes the response
 * body to the supplied output stream. The stream is closed by this task when
 * it finishes, whether or not the download succeeded.
 */
static class DownHtmlFileThread extends Thread {
    private final CloseableHttpClient httpClient;
    private final HttpContext context;
    private final HttpGet httpget;
    private final OutputStream out;

    /**
     * @param httpClient client used to execute the request
     * @param httpget    request to execute
     * @param out        destination for the response body; closed by {@link #run()}
     */
    public DownHtmlFileThread(CloseableHttpClient httpClient,
            HttpGet httpget, OutputStream out) {
        this.httpClient = httpClient;
        // Each task gets its own execution context.
        this.context = HttpClientContext.create();
        this.httpget = httpget;
        this.out = out;
    }

    /**
     * Executes the request and copies the response body to the output stream.
     * I/O and protocol errors are reported via their stack trace and do not
     * propagate out of the task.
     */
    @Override
    public void run() {
        System.out.println(Thread.currentThread().getName() +
                "线程请求的URL为:" + httpget.getURI());
        // try-with-resources closes the response and then the output stream
        // on every path, including a failed execute() or a failed write.
        try (OutputStream sink = out;
             CloseableHttpResponse response = httpClient.execute(httpget, context)) {
            if (response.getEntity() != null) {
                // Copy the body bytes as delivered instead of decoding them and
                // re-encoding with the platform default charset, which could
                // corrupt non-ASCII text.
                response.getEntity().writeTo(sink);
            }
            // Consume the entity (null-safe).
            EntityUtils.consume(response.getEntity());
        } catch (ClientProtocolException ex) {
            ex.printStackTrace(); // protocol error
        } catch (IOException ex) {
            ex.printStackTrace(); // I/O error
        }
    }
}
}