【Java爬虫】005
【Java爬虫】005
最后更新时间:2020年8月1日11:42:7
package com.httpclient;
import org.apache.HttpClient;
import org.apache.http.DefaultHttpClient;
import org.apache.http.HttpClientBuilder;
import org.apache.http.HttpClients;
public class Main {
public static void main(String[] args) {
//6种实例化HttpClient的方式
//第一种方法已过时,不建议使用
HttpClient httpClient1 = new DefaultHttpClient();
HttpClient httpClient2 = ().build();
HttpClient httpClient = ().build();
HttpClient httpClient4 = ();
HttpClient httpClient5 = ();
HttpClient httpClient6 = ();
}
}
1、方式一:一个一个设置
package com.zb.book.httpclient;
import org.apache.http.HttpEntity;
import org.apache.http.HttpRespe;
import org.apache.HttpClient;
import org.apache.methods.HttpGet;
import org.apache.http.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
public class SetHeaderOne {
public static void main(String[] args) throws IOException {
//初始化httpClient
HttpClient httpClient = ().build();
//创建get请求
HttpGet httpGet = new HttpGet("/");
//请求头配置
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng.*/*;q=0.8");
httpGet.setHeader("Accept-Encoding","gzip,deflate");
httpGet.setHeader("Accept-Language","zh-C,zh;q=0.9");
httpGet.setHeader("Cache-Control","max-age=0");
httpGet.setHeader("Host","/");
httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows T 10.0; WOW64) AppleWebKit/57.6 (KHTML, like Gecko) Chrome/78.0.904.108 Safari/57.6");
//发出Get请求
HttpRespe respe = (httpGet);
//获取响应状态码
int statusCode = respe.getStatusLine().getStatusCode();
if(statusCode==200) {
//获取网页内容流
HttpEntity entity = respe.getEntity();
//转换为字符串形式,需要设置编码
String content = (respe.getEntity(), "UTF-8");
println(content);
//关闭内容流
(entity);
}
}
}
2、方式二:封装到list集合统一设置
package com.zb.book.httpclient;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpRespe;
import org.apache.HttpClient;
import org.apache.methods.HttpGet;
import org.apache.http.HttpClients;
import org.apache.BasicHeader;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class SetHeaderList {
public static void main(String[] args) throws IOException {
//通过集合封装头信息
List<Header> headerList = new ArrayList<>();
headerList.add(new BasicHeader(HttpHeaders.ACCEPT,"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng.*/*;q=0.8"));
headerList.add(new BasicHeader(HttpHeaders.ACCEPT_ECODIG,"gzip,deflate"));
headerList.add(new BasicHeader(HttpHeaders.ACCEPT_LAGUAGE,"zh-C,zh;q=0.9"));
headerList.add(new BasicHeader(HttpHeaders.CACHE_COTROL,"max-age=0"));
headerList.add(new BasicHeader(HttpHeaders.HOST,"/"));
headerList.add(new BasicHeader(HttpHeaders.USER_AGET,"Mozilla/5.0 (Windows T 10.0; WOW64) AppleWebKit/57.6 (KHTML, like Gecko) Chrome/78.0.904.108 Safari/57.6"));
//初始化httpClient
HttpClient httpClient = ().setDefaultHeaders(headerList).build();
//创建get请求
HttpGet httpGet = new HttpGet("/");
//发出Get请求
HttpRespe respe = (httpGet);
//获取响应状态码
int statusCode = respe.getStatusLine().getStatusCode();
if(statusCode==200) {
//获取网页内容流
HttpEntity entity = respe.getEntity();
//转换为字符串形式,需要设置编码
String content = (respe.getEntity(), "UTF-8");
println(content);
//关闭内容流
(entity);
}
}
}
3、其他方式
非常简单,可参考前两种方式实现;
package com.zb.book.httpclient;
import org.apache.http.ameValuePair;
import org.apache.entity.UrlEncodedFormEntity;
import org.apache.methods.HttpPost;
import org.apache.BasicameValuePair;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
//Post提交表单
public class PostSubmitForm {
public static void main(String[] args) throws UnsupportedEncodingException {
//使用list集合存储欲传递参数
List<ameValuePair> nvps = new ArrayList<>();
nvps.add(new BasicameValuePair("param1","value1"));
nvps.add(new BasicameValuePair("param2","value2"));
//创建UrlEncodedFormEntity对象
UrlEncodedFormEntity entity = new UrlEncodedFormEntity(nvps, "UTF-8");
//创建HttpPost
HttpPost httpPost = new HttpPost(";);
httpPost.setEntity(entity);
//执行该请求即可实现提交表单
}
}
1、概述
使用HttpClient可设置三种超时时间:RequestTimeout(获取连接超时时间)、ConnectTimeout(建立连接超时时间)、SocketTimeout(获取数据超时时间)。配置这三种超时时间,需要用到HttpClient的RequestConfig类中的方法custom(),该方法返回值为实例化的内部类Builder(配置器),其功能是配置相关请求的字段,还可以设置代理(proxy)、Cookie规范(cookieSpec)、是否允许HTTP相关认证等;
2、代码演示
package com.zb.book.httpclient;
import org.apache.config.RequestConfig;
import org.apache.http.CloseableHttpClient;
import org.apache.http.HttpClients;
//设置超时时间
public class SetTimeout {
public static void main(String[] args) {
//创建RequestConfig配置,全部设置为10秒
RequestConfig requestConfig = ()
.setSocketTimeout(10000)//SocketTimeout(获取数据超时时间)
.setConnectTimeout(10000)//ConnectTimeout(建立连接超时时间)
.setConnectionRequestTimeout(1000)//RequestTimeout(获取连接超时时间)
.build();
//配置到httpclient
CloseableHttpClient httpClient = ().setDefaultRequestConfig(requestConfig).build();
//后面进行正常的请求及相关处理即可
//另外可用请求方法设置配置,其他写法当做适当改变
//httpGet.setConfig(requestConfig);
}
}
package com.zb.book.httpclient;
import org.apache.http.HttpHost;
import org.apache.config.RequestConfig;
import org.apache.http.CloseableHttpClient;
import org.apache.http.HttpClients;
//设置代理服务器
public class SetProxy {
public static void main(String[] args) {
//创建RequestConfig配置,全部设置为10秒
RequestConfig requestConfig = ()
.setProxy(new HttpHost("171.221.29.11",808,null))
.build();
//配置到httpclient
CloseableHttpClient httpClient = ().setDefaultRequestConfig(requestConfig).build();
//后面进行正常的请求及相关处理即可
//另外可用请求方法设置配置,其他写法当做适当改变
//httpGet.setConfig(requestConfig);
}
}
1、概述
下载HTML、图片、PDF和压缩包等文件时,一种方法是使用HttpEntity类将响应实体转化为字节数组,再利用输出流的方式写入指定文件。另一种方法是使用HttpEntity类中的writeTo(OutputStream)方法,直接将响应实体写入指定的输出流中,这种方法简单且常用,代码演示如下。
2、代码演示
package com.zb.book.httpclient;
import org.apache.http.HttpEntity;
import org.apache.methods.CloseableHttpRespe;
import org.apache.methods.HttpGet;
import org.apache.http.CloseableHttpClient;
import org.apache.http.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.FileOutputStream;
import java.io.IOException;
//下载文件
public class DownloadFile {
public static void main(String[] args) throws IOException {
//创建HttpClient对象
CloseableHttpClient httpClient = ().build();
//创建HttpGet对象
HttpGet httpGet = new HttpGet(".png");
//获取结果
CloseableHttpRespe respe = (httpGet);
HttpEntity httpEntity = respe.getEntity();
//写出
httpEntity.writeTo(new FileOutputStream("C:\\Users\\ZiBo\\Desktop\\1.png"));
//消耗实体
(httpEntity);
}
}
1、概述
与jsoup类似,具体做法见代码演示;
2、代码演示
SSLClient类:
package com.httpclient.ssl;
import java.security.KeyManagementException;
import java.;
import java.X509Certificate;
import java.util.Arrays;
import javax.ssl.SSLContext;
import javax.ssl.X509TrustManager;
import org.apache.HttpClient;
import org.apache.config.AuthSchemes;
import org.apache.config.CookieSpecs;
import org.apache.config.RequestConfig;
import org.apache.Registry;
import org.apache.RegistryBuilder;
import org.apache.socket.ConnectionSocketFactory;
import org.apache.socket.PlainConnectionSocketFactory;
import org.apache.;
import org.apache.ssl.SSLConnectionSocketFactory;
import org.apache.http.HttpClients;
import org.apache.http.PoolingHttpClientConnectionManager;
public class SSLClient {
/**
* 基于SSL配置httpClient
* @param SSLProtocolVersion(SSL, SSLv, TLS, TLSv1, TLSv1.1, TLSv1.2)
* @return httpClient
*/
public HttpClient initSSLClient(String SSLProtocolVersion){
RequestConfig defaultConfig = null;
PoolingHttpClientConnectionManager pcm = null;
try {
X509TrustManager xtm = new SSL509TrustManager(); //创建信任管理
//创建SSLContext对象,,并使用指定的信任管理器初始化
SSLContext context = SSLContext.getInstance(SSLProtocolVersion);
context.init(null, new X509TrustManager[]{xtm}, null);
//从SSLContext对象中得到SSLConnectionSocketFactory对象
SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(context, oopHostnameVerifier.ISTACE);
/*从SSLContext对象中得到SSLConnectionSocketFactory对象
*oopHostnameVerifier.ISTACE表示接受接受任何有效的和符合目标主机的SSL会话
*/
Registry<ConnectionSocketFactory> sfr = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.ISTACE)
.register("https", sslConnectionSocketFactory).build();
//基于配置创建连接池
pcm = new PoolingHttpClientConnectionManager(sfr);
}catch(oSuchAlgorithmException | KeyManagementException e){
e.printStackTrace();
}
//设置全局请求配置,包括Cookie规范,HTTP认证,超时
defaultConfig = ().setCookieSpec(CookieSpecs.STADARD_STRICT)
.setExpectContinueEnabled(true)
.setTargetPreferredAuthSchemes(Arrays.asList(AuthSchemes.TLM, AuthSchemes.DIGEST))
.setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC))
.setConnectionRequestTimeout(0*1000)
.setConnectTimeout(0*1000)
.setSocketTimeout(0*1000)
.build();
//初始化httpclient
HttpClient httpClient = ().setConnectionManager(pcm).setDefaultRequestConfig(defaultConfig)
.build();
return httpClient;
}
//实现X509TrustManager接口
private static class SSL509TrustManager implements X509TrustManager {
//检查客户端证书
public void checkClientTrusted(X509Certificate[] x509Certificates, String s) {
//do nothing 接受任意客户端证书
}
//检查服务器端证书
public void checkServerTrusted(X509Certificate[] x509Certificates, String s) {
//do nothing 接受任意服务端证书
}
//返回受信任的X509证书
public X509Certificate[] getAcceptedIssuers() {
return new X509Certificate[0];
}
};
}
test类:
package com.httpclient.ssl;
import java.io.IOException;
import org.apache.http.HttpRespe;
import org.apache.http.HttpStatus;
import org.apache.http.ParseException;
import org.apache.HttpClient;
import org.apache.methods.HttpGet;
import org.apache.http.util.EntityUtils;
public class Test {
public static void main(String[] args) throws ParseException, IOException {
String url = "/";
SSLClient sslClient = new SSLClient(); //实例化
HttpClient httpClientSSL = sslClient.initSSLClient("TLS");
HttpGet httpGet = new HttpGet(url);
//获取结果
HttpRespe httpRespe = null;
try {
httpRespe = (httpGet);
} catch (IOException e) {
e.printStackTrace();
}
if(httpRespe .getStatusLine().getStatusCode() == HttpStatus.SC_OK){ //状态码200表示响应成功
//获取实体内容
String entity = (httpRespe.getEntity(),"UTF-8");
//输出实体内容
println(entity);
(httpRespe.getEntity()); //消耗实体
}else {
//关闭HttpEntity的流实体
(httpRespe.getEntity()); //消耗实体
}
}
}
1、概述
使用HttpClient请求URL时,有时会出现请求异常的情况。针对一些非致命的异常,可以通过请求重试解决。HttpClient提供了默认重试策略DefaultHttpRequestRetryHandler。DefaultHttpRequestRetryHandler类实现了HttpRequestRetryHandler接口,重写了retryRequest()方法。由源码可以发现DefaultHttpRequestRetryHandler类定义的默认重试次数为3次;幂等方法(如GET和HEAD是幂等的)可以重试:如果网页请求失败,可以重试。另外,针对4种异常不进行重试,这四种异常分别是InterruptedIOException (线程中断异常)、UnknownHostException (未知的Host异常)、ConnectException (连接异常,如连接拒绝异常)和SSLException ( HTTPS请求认证异常)。
2、代码演示
package com.zb.book.httpclient;
import org.apache.http.DefaultHttpRequestRetryHandler;
import org.apache.http.HttpClients;
//设置请求重试
public class SetRequestRetry {
public static void main(String[] args) {
//第一种:默认重试次
()
.setRetryHandler(new DefaultHttpRequestRetryHandler())
.build();
//第二种:自定义重试5次
()
.setRetryHandler(new DefaultHttpRequestRetryHandler(5,true))
.build();
}
}
3、补充说明
值得注意的是,在进行数据爬取时经常遇到的两种超时时间: ConnectTimeout(建立连接的超时时间)和SocketTimeout(获取数据的超时时间),这两种超时时间对应的异常( ConnectTimeoutException与SocketTimeoutException )都继承自InterruptedIOException类,即属于线程中断异常,不会进行重试。
(可参考原始学习笔记的连接池)
代码语言:javascript代码运行次数:0运行复制package com.;
import java.io.FileotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import CodingErrorAction;
import java.util.Arrays;
import java.ExecutorService;
import java.Executors;
import org.apache.http.Cts;
import org.apache.ClientProtocolException;
import org.apache.config.AuthSchemes;
import org.apache.config.CookieSpecs;
import org.apache.config.RequestConfig;
import org.apache.methods.CloseableHttpRespe;
import org.apache.methods.HttpGet;
import org.apache.protocol.HttpClientContext;
import org.apache.ConnectionConfig;
import org.apache.SocketConfig;
import org.apache.http.CloseableHttpClient;
import org.apache.http.HttpClients;
import org.apache.http.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
public class Test {
public static void main(String[] args) throws FileotFoundException {
//添加连接参数
ConnectionConfig connectionConfig = ()
.setMalformedInputAction(CodingErrorAction.IGORE)
.setUnmappableInputAction(CodingErrorAction.IGORE)
.setCharset(Cts.UTF_8)
.build();
//添加socket参数
SocketConfig socketConfig = ()
.setTcpoDelay(true)
.build();
//配置连接池管理器
PoolingHttpClientConnectionManager pcm = new PoolingHttpClientConnectionManager();
// 设置最大连接数
pcm.setMaxTotal(100);
// 设置每个连接的路由数
pcm.setDefaultMaxPerRoute(10);
//设置连接信息
pcm.setDefaultConnectionConfig(connectionConfig);
//设置socket信息
pcm.setDefaultSocketConfig(socketConfig);
//设置全局请求配置,包括Cookie规范,HTTP认证,超时
RequestConfig defaultConfig = ()
.setCookieSpec(CookieSpecs.STADARD_STRICT)
.setExpectContinueEnabled(true)
.setTargetPreferredAuthSchemes(Arrays
.asList(AuthSchemes.TLM, AuthSchemes.DIGEST))
.setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC))
.setConnectionRequestTimeout(0*1000)
.setConnectTimeout(0*1000)
.setSocketTimeout(0*1000)
.build();
CloseableHttpClient httpClient = ()
.setConnectionManager(pcm)
.setDefaultRequestConfig(defaultConfig)
.build();
// 请求的URL
String[] urlArr = {
";,
";,
";,
";,
";
};
//创建固定大小的线程池
ExecutorService exec = ();
for(int i = 0; i< urlArr.length;i++){
String filename = urlArr[i].split("org/")[1]; //HTML需要输出的文件名
//创建HTML文件输出目录
OutputStream out = new FileOutputStream("file/" + filename);
HttpGet httpget = new HttpGet(urlArr[i]);
//启动线程执行请求
(new DownHtmlFileThread(httpClient, httpget, out));
}
//关闭线程
exec.shutdown();
}
static class DownHtmlFileThread extends Thread {
private final CloseableHttpClient httpClient;
private final HttpContext context;
private final HttpGet httpget;
private final OutputStream out;
//输入的参数
public DownHtmlFileThread(CloseableHttpClient httpClient,
HttpGet httpget, OutputStream out) {
this.httpClient = httpClient;
= ();
this.httpget = httpget;
= out;
}
@Override
public void run() {
println(().getame() +
"线程请求的URL为:" + httpget.getURI());
try {
CloseableHttpRespe respe = (
httpget, context); //执行请求
try {
//HTML文件写入文档
out.write((respe.getEntity(),"gbk")
.getBytes());
();
//消耗实体
(respe.getEntity());
} finally{
(); //关闭响应
}
} catch (ClientProtocolException ex) {
ex.printStackTrace(); // 处理 protocol错误
} catch (IOException ex) {
ex.printStackTrace(); // 处理I/O错误
}
}
}
}
本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。 原始发表:2025-01-06,如有侵权请联系 cloudcommunity@tencent 删除学习笔记java爬虫httpclient配置 #感谢您对电脑配置推荐网 - 最新i3 i5 i7组装电脑配置单推荐报价格的认可,转载请说明来源于"电脑配置推荐网 - 最新i3 i5 i7组装电脑配置单推荐报价格
上一篇:【Java爬虫】006
下一篇:【Java爬虫】004
推荐阅读
留言与评论(共有 13 条评论) |
本站网友 51号星球下载 | 6分钟前 发表 |
直接将响应实体写入指定的输出流中 | |
本站网友 黄曲霉毒素检测方法 | 5分钟前 发表 |
image/webp | |
本站网友 yankuang | 27分钟前 发表 |
SocketTimeout(获取数据超时时间) | |
本站网友 最大二维码 | 15分钟前 发表 |
String s) { //do nothing 接受任意客户端证书 } //检查服务器端证书 public void checkServerTrusted(X509Certificate[] x509Certificates | |
本站网友 退房申请 | 12分钟前 发表 |
另外 | |
本站网友 未来房价走势 | 1分钟前 发表 |
另一种方法是使用HttpEntity类中的writeTo(OutputStream)方法 | |
本站网友 乌海个人二手房网 | 3分钟前 发表 |
like Gecko) Chrome/78.0.904.108 Safari/57.6"); //发出Get请求 HttpRespe respe = (httpGet); //获取响应状态码 int statusCode = respe.getStatusLine().getStatusCode(); if(statusCode==200) { //获取网页内容流 HttpEntity entity = respe.getEntity(); //转换为字符串形式 | |
本站网友 重庆阳光100 | 26分钟前 发表 |
九 | |
本站网友 怀孕一个月胎儿图 | 30分钟前 发表 |
"zh-C | |
本站网友 北京机动车摇号 | 10分钟前 发表 |
"max-age=0")); headerList.add(new BasicHeader(HttpHeaders.HOST | |
本站网友 老年人补品 | 20分钟前 发表 |
另外 | |
本站网友 医疗事故案例 | 13分钟前 发表 |
ConnectException (连接异常 |