爱萝莉真是太好了 爱萝莉真是太好了

努力让自己变得更优秀呀!

目录
Java爬取美女图片妹子图(mzitu)
/  

Java爬取美女图片妹子图(mzitu)

1.准备图片网站 https://www.mzitu.com/

2.本篇给出爬取妹子图网站全站图片的代码,采用 Java 语言编写。由于妹子图有防爬措施,本篇借助极光 IP 客户端自动切换 IP 来应对;考虑到切换 IP 时会出现访问异常,代码在相关下载处做了无限重试处理。

 1	<!--相关依赖-->
 2	<dependency>
 3    		<groupId>org.jsoup</groupId>
 4    		<artifactId>jsoup</artifactId>
 5    		<version>1.11.3</version>
 6	</dependency>
 7	<dependency>
 8		<groupId>cn.hutool</groupId>
 9		<artifactId>hutool-all</artifactId>
10		<version>5.1.0</version>
11	</dependency>

3.爬取全站图片时需要注意,妹子图的全站列表分为以下两个地址(下方代码中的初始 url 使用的是 /old/):

https://www.mzitu.com/all/
https://www.mzitu.com/old/

4.下面直接上代码,文末有可直接下载的 .java 文件

  1package com.Yang;
  2
  3import java.io.File;
  4import java.util.UUID;
  5import java.util.concurrent.ExecutorService;
  6import java.util.concurrent.LinkedBlockingQueue;
  7import java.util.concurrent.ThreadPoolExecutor;
  8import java.util.concurrent.TimeUnit;
  9
 10import org.jsoup.Jsoup;
 11import org.jsoup.nodes.Document;
 12import org.jsoup.nodes.Element;
 13import org.jsoup.select.Elements;
 14import org.junit.runner.RunWith;
 15import org.springframework.boot.test.context.SpringBootTest;
 16import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
 17
 18import cn.hutool.core.thread.ThreadUtil;
 19import cn.hutool.http.HttpRequest;
 20import cn.hutool.http.HttpResponse;
 21import cn.hutool.http.HttpUtil;
 22import lombok.extern.slf4j.Slf4j;
 23
 24@RunWith(SpringJUnit4ClassRunner.class)
 25@SpringBootTest
 26@Slf4j
 27public class Meizitu {
 28
 29    static String FileDir = "D:\\mzitu\\";//图片保存位置
 30    static String UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400";
 31    static int queueMaxSize = 3;//同时处理最大队列数量
 32    static LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>(queueMaxSize);
 33    static ExecutorService threadPool = new ThreadPoolExecutor(queueMaxSize, queueMaxSize, 0L, TimeUnit.MILLISECONDS,queue);
 34
 35    // 目的:传递一个url(http的地址),返回对应地址下的HTML的文档
 36    public static String getHtml(String url){
 37  
 38        HttpRequest createGet = HttpUtil.createGet(url);
 39        createGet.header("User-Agent", UserAgent)
 40        		 .timeout(10000);
 41  
 42        HttpResponse response;
 43        while(true) {
 44        	try {
 45        		response = createGet.execute();
 46        		break;
 47			} catch (Exception e) {
 48	
 49			}
 50        }
 51        int status = response.getStatus();
 52        log.info("[{}] , 返回状态码:{}" , url , status);
 53        if(status == 301 || status == 302) {
 54        	 HttpRequest createGet2 = HttpUtil.createGet(response.header("location"));
 55        	 createGet2.header("User-Agent", UserAgent)
 56        	 			.timeout(-1);
 57             HttpResponse response2 = createGet2.execute();
 58             log.info("[{}] , 返回状态码:{}" , response.header("location") , response2.getStatus());
 59             return response2.body();
 60        }
 61        if(status == 404) {
 62        	return "";
 63        }
 64        return response.body();
 65    }
 66
 67    public static void downLoadImg(String url,String dir){
 68        HttpRequest createGet = HttpUtil.createGet(url);
 69        createGet.header("User-Agent", UserAgent);
 70        createGet.header("Referer","https://www.mzitu.com/");
 71        createGet.timeout(3000);
 72        HttpResponse response = createGet.execute();
 73        log.info("[{}] , 返回状态码:{}" , url , response.getStatus());
 74        if(response.getStatus() == 429) {
 75        	log.info("频率过快 ,两秒后重试");
 76        	ThreadUtil.sleep(2000);
 77        	downLoadImg(url,dir);
 78        }
 79	if(response.getStatus() == 404) {
 80        	return;
 81        }
 82        String ext = url.substring(url.lastIndexOf("."));
 83        String img = "";
 84        img = UUID.randomUUID().toString()+ext;
 85     // 先建文件夹
 86        File file = new File(FileDir + dir.replaceAll("\\:", "").replaceAll("\\?", "").replaceAll("\"", "") + "\\");
 87        synchronized (file) {
 88            if (!file.exists()){
 89                file.mkdirs();
 90            }
 91		}
 92        response.writeBody(new File(FileDir + dir.replaceAll("\\:", "").replaceAll("\\?", "").replaceAll("\"", "") + "\\" + img));
 93    }
 94 
 95  
 96
 97    public static void submit(int index ,int page, String url , String dir) {
 98    	String url_cp = url;
 99    	String dir_cp = dir;
100    	while(true) {
101    		ThreadUtil.sleep(20);
102    		if(queue.size() < queueMaxSize) {
103  
104    			threadPool.execute(new Runnable() {
105    	
106    				@Override
107    				public void run() {
108    					int count = 0 ;
109    		
110    					while(true) {
111    						try {
112    							downLoadImg(url_cp,dir_cp);
113    							return;
114        					} catch (Exception e) {
115        						count++;
116        						log.error("第" + index + "条,page:" + page +"下载失败" + count + "次" , e);
117        					}
118    					}
119    				}
120    			});
121    			return;
122        	}
123    	}
124    }
125  
126    public static void main(String[] args) {
127  
128    	// 初始化url
129        String url = "https://www.mzitu.com/old/";
130//        String url = "https://www.baidu.com";
131        // 获取所有页面
132        String html = getHtml(url);
133        // 解析url
134        Document document = Jsoup.parse(html);
135        Elements elements = document.select("[target=_blank]");
136        log.info("数据共{}条" , elements.size());
137        int index = 0;
138        for (Element element : elements) {
139        	index++;
140        	log.info("开始执行第{}条" , index);
141            String albumUrl = element.attr("href");
142            // 遍历解析每一个URL,得到每一个相册的html
143            String eachAlblumHtml = getHtml(albumUrl);   
144            Document eachAlblumDocument = Jsoup.parse(eachAlblumHtml);
145            Elements ele_a = eachAlblumDocument.getElementsByClass("pagenavi").get(0).getElementsByTag("a");
146            //找到最大页码
147            int pageSize = Integer.valueOf(ele_a.get(ele_a.size() -2).text());
148            log.info("最大页码:{}" , pageSize);
149            //找图片url
150            Elements imgElements = eachAlblumDocument.select(".main-image img");
151            if (imgElements.size() > 0) {
152                String imgSrc = imgElements.get(0).attr("src");
153                String baseImg = imgSrc.substring(0, imgSrc.length() - 6);
154          
155                for(int page = 1 ; page<pageSize ; page++) {
156                	int value = page;
157                	try {
158                		if(value < 10) {
159                    		submit(index, value, baseImg + "0" + value + ".jpg", element.text());
160                    	}else {
161                        	submit(index, value, baseImg + value + ".jpg", element.text());
162                    	}
163					} catch (Exception e) {
164						log.error("第" + index + "条,page:" + value +"下载失败######" , e);
165						page--;
166					}
167          
168                }
169                while(queue.size() != 0) {
170                	ThreadUtil.sleep(20);
171                }
172            }
173        }
174    }
175}
176

5.源码文件下载

Meizitu.7z

6.最后图片下载出来的效果

微信图片 20200815102832.png


标题:Java爬取美女图片妹子图(mzitu)
地址:https://www.1-love.cn/get-mzitu.html