如何设计一个优秀的小程序开发平台?
631
2023-06-04
springBoot+webMagic实现网站爬虫的实例代码
前段时间公司项目需要抓取各类数据,Python 玩得不够熟练,只好研究 Java 爬虫方案,在此做一个总结。
开发环境:
springBoot 2.2.6、jdk1.8。
1、导入依赖
话不多说,直接上代码。
基础案例
下面代码说明以一个类似列表的页面为例
package com.crawler.project.proTask;
import com.alibaba.fastjson.JSONObject;
import org.springframework.scheduling.annotation.Scheduled;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
public class TaskProcessor implements PageProcessor {
/*
* 此方法为爬虫业务实现
* */
@Override
public void process(Page page) {
//1、爬虫任务获取到一个page 解析page上的列表
List
if (list.size() > 0){//说明为列表页面、需要解析列表中每个元素的链接,存入待获取page队列中
for (Selectable selectable : list) {
//遍历集合,将每个元素链接存入待获取page队列中
page.addTargetRequest(selectable.links().toString());
}
//同时将下一页的url存入队列中
page.addTargetRequest("下一页的url");
}else {
//此时为列表中单个元素对应的详情页
//在自定义方法中处理详细页,获取需要的数据进行处理。
handle(page);
}
}
private void handle(Page page) {
//例如 处理后的数据为一个JSONObject对象
JSONObject tmp = new JSONObject();
//将这个tmp交由自定义的TaskPipline类处理,若未自定义Pipline并设置到Spider参数中,框架会默认将tmp打印到控制台。
page.putField("obj",tmp);
}
/*
* 此方法为配置爬虫过程的一些参数
* */
private Site site = Site.me()
.setCharset("UTF-8")
.setTimeOut(60 * 1000)
.setRetrySleepTime(60 * 1000)
.setCycleRetryTimes(5);
@Override
public Site getSite() {
return site;
}
/*
设置定时任务,执行爬虫任务
* */
@Scheduled(initialDelay = 1 * 1000,fixedDelay = 2 * 1000)
public void process(){
System.out.println("开始执行爬虫抓取任务");
Spider.create(new TaskProcessor())//注意这里的类名要和当前类名对应
.addUrl("起始页url")
.addPipeline(new TaskPipeline()) //此处课自定义 数据处理类 (在handle()方法中有);
.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
.thread(3)//此处设置线程数量(不宜过多,最好和列表页中列表元素数量一致)
.run();
}
}
package com.crawler.project.proTask;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
public class TaskPipeline implements Pipeline {
    /*
     * Receives the fields stored via page.putField() and applies custom
     * business processing to the "obj" entry.
     * */
    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.getAll().size() > 0) {
            Object obj = resultItems.getAll().get("obj");
            // Guard against pages that stored fields other than "obj":
            // the original code would throw a NullPointerException on obj.toString().
            if (obj != null) {
                JSONObject jsonObject = JSON.parseObject(obj.toString());
                // Custom business processing on jsonObject goes here.
            }
        }
    }
}
特殊情况一
需根据链接下载图片或文件
eg:在上面说到的详情页中含有iframe。
1、首先获取iframe的src
// Get the iframe's src. NOTE: check whether the obtained src is an absolute or a
// relative path — a relative path must be prefixed with the site's base URL.
String src = html.css("css selector", "src").toString();
// Parse the iframe document with jsoup (second argument is the timeout in ms).
Document document = Jsoup.parse(new URL(src),1000);
// Select the element we need.
Element ele = document.select("css selector").last();
// Read the download link of the target file from its href attribute.
String downUrl = ele.attr("href");
// Download the file behind the link; returns the generated file name.
String fileName = downloadFile(downUrl);
// Download a file from a URL (see downloadFile below).
// Downloads the file at fileUrl into STATIC_FILEPATH under a random ".mp3" name.
// Returns the generated file name, or null when the download fails (best-effort).
public String downloadFile(String fileUrl) throws FileNotFoundException{
    try {
        final URL source = new URL(fileUrl);
        final String generatedName = UUID.randomUUID().toString() + ".mp3";
        final File target = new File(this.STATIC_FILEPATH + generatedName);
        System.out.println("============保存文件方法被调用===============");
        FileUtils.copyURLToFile(source, target);
        return generatedName;
    } catch (Exception e) {
        // Best-effort: log the failure and signal it with a null file name.
        e.printStackTrace();
        return null;
    }
}
特殊情况二
有些https站点 无法直接使用WebMagic默认的下载器下载,此时我们可以根据站点ssl类型修改下载器。
在项目中创建一个包用于存放自定义(修改)的下载器类
(!!!摘自webMagic框架中HttpClientDownloader,基于此类修改!!!)
/*
此方法中需要传入一个自定义的生成器(HttpClientGenerator)
*/
package com.crawler.project.spider_download;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.downloader.HttpClientRequestContext;
import us.codecraft.webmagic.downloader.HttpUriRequestConverter;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
/**
* The http downloader based on HttpClient.
*
* @author code4crafter@gmail.com
* @since 0.1.0
*/
public class HttpClientDownloader extends AbstractDownloader {
private Logger logger = LoggerFactory.getLogger(getClass());
private final Map
//自定义的生成器(HttpClientGenerator)注意导入的应为自定义的HttpClientGenerator类,而不是WebMagic依赖中的HttpClientGenerator类。
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
private ProxyProvider proxyProvider;
private boolean responseHeader = true;
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
this.httpUriRequestConverter = httpUriRequestConverter;
}
public void setProxyProvider(ProxyProvider proxyProvider) {
this.proxyProvider = proxyProvider;
}
private CloseableHttpClient getHttpClient(Site site) {
if (site == null) {
return httpClientGenerator.getClient(null);
}
String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) {
synchronized (this) {
httpClient = httpClients.get(domain);
if (httpClient == null) {
httpClient = httpClientGenerator.getClient(site);
httpClients.put(domain, httpClient);
}
}
}
return httpClient;
}
@Override
public Page download(Request request, Task task) {
if (task == null || task.getSite() == null) {
throw new NullPointerException("task or site can not be null");
}
CloseableHttpResponse httpResponse = null;
CloseableHttpClient httpClient = getHttpClient(task.getSite());
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
Page page = Page.fail();
try {
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
onSuccess(request);
logger.info("downloading page success {}", request.getUrl());
return page;
} catch (IOException e) {
logger.warn("download page {} error", request.getUrl(), e);
onError(request);
return page;
} finally {
if (httpResponse != null) {
//ensure the connection is released back to pool
EntityUtils.consumeQuietly(httpResponse.getEntity());
}
if (proxyProvider != null && proxy != null) {
proxyProvider.returnProxy(proxy, page, task);
}
}
}
@Override
public void setThread(int thread) {
httpClientGenerator.setPoolSize(thread);
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String contentTypTltmUgNyLle = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page();
page.setBytes(bytes);
if (!request.isBinaryContent()){
if (charset == null) {
charset = getHtmlCharset(contentType, bytes);
}
page.setCharset(charset);
page.setRawText(new String(bytes, charset));
}
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
page.setDownloadSuccess(true);
if (responseHeader) {
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
}
return page;
}
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) {
charset = Charset.defaultCharset().name();
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
}
return charset;
}
}
然后在自定义的HttpClientGenerator类中修改有关ssl的参数
(!!!摘自webMagic框架中HttpClientGenerator,基于此类修改!!!)
/*
自定义的HttpClientGenerator生成器
*/
package com.sealion_crawler.project.spider_download;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.downloader.CustomRedirectStrategy;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
/**
* @author code4crafter@gmail.com
* @since 0.4.0
*/
public class HttpClientGenerator {
private transient Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator() {
Registry
.register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", buildSSLConnectionSocketFactory())
.build();
connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setDefaultMaxPerRoute(100);
}
/*
此方法中设置ssl有关参数。
*/
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
http:// try {
return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},
null,
new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
}
return SSLConnectionSocketFactory.getSocketFactory();
}
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
/*
下面为当前框架默认参数
SSLContext sc = SSLContext.getInstance("SSLv3");
可修改为需要的ssl参数类型
*/
SSLContext sc = SSLContext.getInstance("TLS");
sc.init(null, new TrustManager[] { trustManager }, null);
return sc;
}
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);
return this;
}
public CloseableHttpClient getClient(Site site) {
return generateClient(site);
}
private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
httpClientBuilder.setConnectionManager(connectionManager);
if (site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());
} else {
httpClientBuilder.setUserAgent("");
}
if (site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process(
final HttpRequest request,
final HttpContext context) throws HttpException, IOException {
if (!request.containsHeader("Accept-Encoding")) {
request.addHeader("Accept-Encoding", "gzip");
}
}
});
}
//解决post/redirect/post 302跳转问题
httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
socketConfigBuilder.setSoTimeout(site.getTimeOut());
SocketConfig socketConfig = socketConfigBuilder.build();
httpClientBuilder.setDefaultSocketConfig(socketConfig);
connectionManager.setDefaultSocketConfig(socketConfig);
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
generateCookie(httpClientBuilder, site);
return httpClientBuilder.build();
}
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
if (site.isDisableCookieManagement()) {
httpClientBuilder.disableCookieManagement();
return;
}
CookieStore cookieStore = new BasicCookieStore();
for (Map.Entry
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(site.getDomain());
cookieStore.addCookie(cookie);
}
for (Map.Entry
for (Map.Entry
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(domainEntry.getKey());
cookieStore.addCookie(cookie);
}
}
httpClientBuilder.setDefaultCookieStore(cookieStore);
}
}
好了,基于WebMagic框架实现爬虫(包括jsoup的使用)的总结就到这里了。
版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。
发表评论
暂时没有评论,来抢沙发吧~