An Introduction to HtmlUnit for Java Crawlers
Posted on 2022-10-20 16:39:01
Front-end work sometimes throws up a project that suddenly needs data collected from the web. What approach is easy to understand and keeps working over the long term? Using the browser-side test harness as a crawler is the most convenient option: take the test code you already write day to day, make a few small changes, pair it with a crawler proxy, and you can start collecting data right away.
I shared an article about Java crawlers a while back, and today's topic is also about crawling, so let's continue with Java. If you work in Java, are you familiar with HtmlUnit? It is a headless-browser solution for Java: through its API it simulates the HTML protocol, so it can request pages, submit forms, follow links and so on, fully imitating a user's browser. It supports complex JavaScript and AJAX libraries and can emulate several browsers, including Chrome, Firefox and IE.
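A minimal sketch of how that API looks in code, assuming HtmlUnit 2.x (classes under the com.gargoylesoftware.htmlunit package, Maven artifact net.sourceforge.htmlunit:htmlunit); the target URL is just a placeholder:

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HtmlUnitHello {
    public static void main(String[] args) throws Exception {
        // WebClient is the headless "browser"; here it emulates Chrome
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true);            // execute the page's JavaScript
            webClient.getOptions().setCssEnabled(false);                  // skip CSS processing for speed
            webClient.getOptions().setThrowExceptionOnScriptError(false); // tolerate broken scripts on real-world pages

            // Request the page just as a browser would
            HtmlPage page = webClient.getPage("https://httpbin.org/html");
            System.out.println(page.getTitleText());
            // The DOM serialized after any JavaScript has run
            System.out.println(page.asXml());
        }
    }
}

Form submission and link clicking work the same way, through the HtmlForm and HtmlAnchor element classes that the page object returns.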
Below is a simple demo that calls an IP-lookup site through a crawler proxy. Point the target URL at the pages you actually want to collect and you will get the corresponding data back; add a parsing module and it is basically usable. The example was written around a real project's requirements, so it is a bit more involved:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.ProxyAuthenticationStrategy;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.NameValuePair;
import org.apache.http.util.EntityUtils;

public class Demo
{
    // Proxy server (vendor site: www.16yun.cn)
    final static String proxyHost = "t.16yun.cn";
    final static Integer proxyPort = 31000;

    // Proxy credentials
    final static String proxyUser = "username";
    final static String proxyPass = "password";

    private static PoolingHttpClientConnectionManager cm = null;
    private static HttpRequestRetryHandler httpRequestRetryHandler = null;
    private static HttpHost proxy = null;

    private static CredentialsProvider credsProvider = null;
    private static RequestConfig reqConfig = null;

    static {
        ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
        LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory.getSocketFactory();

        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
            .register("http", plainsf)
            .register("https", sslsf)
            .build();

        cm = new PoolingHttpClientConnectionManager(registry);
        cm.setMaxTotal(20);
        cm.setDefaultMaxPerRoute(5);

        proxy = new HttpHost(proxyHost, proxyPort, "http");

        credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(proxyUser, proxyPass));

        reqConfig = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(5000)
            .setExpectContinueEnabled(false)
            .setProxy(new HttpHost(proxyHost, proxyPort))
            .build();
    }

    public static void doRequest(HttpRequestBase httpReq) {
        CloseableHttpResponse httpResp = null;

        try {
            setHeaders(httpReq);

            httpReq.setConfig(reqConfig);

            CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setDefaultCredentialsProvider(credsProvider)
                .build();

            // Enable TCP keep-alive so the exit IP does not switch while visiting HTTPS sites
            // SocketConfig socketConfig = SocketConfig.custom().setSoKeepAlive(true).setSoTimeout(3600000).build();
            // CloseableHttpClient httpClient =  HttpClients.custom()
            //    .setConnectionManager(cm)
            //    .setDefaultCredentialsProvider(credsProvider)
            //    .setDefaultSocketConfig(socketConfig)
            //    .build();

            AuthCache authCache = new BasicAuthCache();
            authCache.put(proxy, new BasicScheme());
            // If the proxy answers 407, pre-authenticate against Proxy-Authenticate
            // authCache.put(proxy, new BasicScheme(ChallengeState.PROXY));

            HttpClientContext localContext = HttpClientContext.create();
            localContext.setAuthCache(authCache);

            httpResp = httpClient.execute(httpReq, localContext);

            int statusCode = httpResp.getStatusLine().getStatusCode();

            System.out.println(statusCode);

            BufferedReader rd = new BufferedReader(new InputStreamReader(httpResp.getEntity().getContent()));

            String line = "";
            while((line = rd.readLine()) != null) {
                System.out.println(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (httpResp != null) {
                    httpResp.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Set the request headers
     *
     * @param httpReq
     */
    private static void setHeaders(HttpRequestBase httpReq) {

        // Set the Proxy-Tunnel header
        // Random random = new Random();
        // int tunnel = random.nextInt(10000);
        // httpReq.setHeader("Proxy-Tunnel", String.valueOf(tunnel));

        httpReq.setHeader("Accept-Encoding", null);

    }

    public static void doGetRequest() {
        // Target page to visit
        String targetUrl = "https://httpbin.org/ip";

        try {
            HttpGet httpGet = new HttpGet(targetUrl);

            doRequest(httpGet);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        doGetRequest();
    }
}
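The demo above drives Apache HttpClient directly. If you would rather send the same proxied request through HtmlUnit itself (so JavaScript-heavy pages are rendered before you scrape them), a rough sketch along the following lines should work; it reuses the proxy host, port and credential placeholders from the demo and assumes HtmlUnit 2.x, where WebClient accepts a proxy in its constructor and proxy authentication goes through DefaultCredentialsProvider:

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;

public class HtmlUnitProxyDemo {
    public static void main(String[] args) throws Exception {
        // Send all traffic through the tunnel proxy (same host/port as the HttpClient demo)
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME, "t.16yun.cn", 31000)) {
            // Proxy authentication (same username/password placeholders as above)
            DefaultCredentialsProvider credentials =
                (DefaultCredentialsProvider) webClient.getCredentialsProvider();
            credentials.addCredentials("username", "password");

            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);

            // Fetch the IP-echo page through the proxy and print what the target site sees;
            // the response is JSON, so read the raw body rather than expecting an HTML page
            Page page = webClient.getPage("https://httpbin.org/ip");
            System.out.println(page.getWebResponse().getContentAsString());
        }
    }
}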
The example is adapted from 亿牛云 (16yun) material; I bought their proxy for an earlier business requirement and have kept using it since. As I'm sharing this article anyway, I'll pass the recommendation along: if you need a proxy, try their tunnel proxy. Of all the vendors I have used, they offer the best IP quality and after-sales support. A detailed introduction to the proxy…

