java爬妹子图

AItechnology 发表于 2020-7-27 15:53

之前在论坛看到一位大佬发了一个爬妹子图的爬虫程序，地址https://www.52pojie.cn/forum.php?mod=viewthread&tid=1227603&extra=page%3D1%26filter%3Dtypeid%26typeid%3D192
受这篇帖子的启发，利用其思路，自己写了一个爬取妹子图的java方法体，可以爬取https://www.mzitu.com的妹子图，哈哈，在此分享给大家。另外，有一位大佬直接分享了成品，我这里的呢，就是大家动动脑子，自己爬取了。大佬作品地址https://www.52pojie.cn/forum.php?mod=viewthread&tid=828175&extra=page%3D1%26filter%3Dtypeid%26typeid%3D192
那么先来说说思路吧。(思路得益于第一个贴出代码的帖子)第一步:先利用jsoup获取主页上图片的链接地址；第二步:进入主页中地址的二级地址，判断是否是图片地址，如果是，利用保存程序保存图片，如果不是，进入下一级地址，如此往复，直到爬取完图片。
那么上代码了:

/**
 * Crawls the front page of https://www.mzitu.com/, walks the pages of every album linked from
 * it, collects one image URL per album page and finally passes the collected URLs and the
 * generated file names to {@code SavePngJoup}.
 *
 * <p>Whatever was collected before an I/O failure or an interrupt is still handed to
 * {@code SavePngJoup}, so a late failure does not throw away the work already done.
 * Tip: print {@code doc.toString()} to inspect the page source when the selectors need tuning.
 */
public static void getTestContent() {
    ArrayList<String> filePath = new ArrayList<>(); // image URLs, index-aligned with fileId
    ArrayList<String> fileId = new ArrayList<>();   // unique file names (album ID + page number)
    String url = "https://www.mzitu.com/";
    try {
        try {
            // First level: links found inside the "postlist" block of the front page.
            Document doc = Jsoup.connect(url).get();
            Elements links = doc.getElementsByClass("postlist").select("ul").select("li").select("a");
            int num = 0;
            albums:
            for (Element element : links) {
                num = num + 1;
                String url1 = element.attr("href");
                // Only every other link is followed (same filter as before).
                if ((num % 2) == 0) {
                    continue;
                }
                // The album ID is the first path segment after the site root; it is used both to
                // build the album URL and as the file-name prefix.
                String ID = extractAlbumId(url1, url);
                if (ID.isEmpty()) {
                    continue; // link does not point below the site root, nothing to crawl
                }

                // Second level: the album page, whose "pagenavi" bar tells how many pages exist.
                String url2 = url + ID;
                Document document = Jsoup.connect(url2).get();
                Elements img = document.getElementsByClass("pagenavi").select("a");
                int tmpnum = img.size();
                int lastnum = 0;
                if (tmpnum > 2) {
                    // The second-to-last navigation link carries the highest page number.
                    String lastSegment =
                            StringTools.getLastStringBySeperate(img.get(tmpnum - 2).attr("href"), "/");
                    try {
                        lastnum = Integer.parseInt(lastSegment);
                    } catch (NumberFormatException e) {
                        System.out.println(url2);
                        e.printStackTrace();
                        continue; // navigation bar has an unexpected layout, skip this album
                    }
                } else if (tmpnum == 1) {
                    lastnum = 3;
                }

                // Third level: one request per album page to read the real image address.
                // NOTE(review): pages 2..lastnum-1 are visited, exactly as before; whether page 1
                // and page lastnum should be included as well needs checking against the site.
                for (int i = 2; i < lastnum; i++) {
                    String url3 = url2 + "/" + i;
                    Document docss = Jsoup.connect(url3).get();
                    Elements liss = docss.getElementsByClass("main-image").select("p").select("a").select("img");
                    if (liss.isEmpty()) {
                        continue; // page without an image element
                    }

                    String url4 = liss.get(0).attr("src");
                    // Both lists are appended together so they can never get out of step.
                    filePath.add(url4);
                    if (i < 10) {
                        fileId.add(ID + "0" + i); // pad to keep file names the same length
                    } else {
                        fileId.add(ID + i);
                    }
                    System.out.println(url4);

                    try {
                        // Throttle: the server appears to reject rapid bursts of requests.
                        Thread.sleep(500);
                    } catch (InterruptedException e) {
                        // Restore the flag and stop crawling; what was collected is still saved.
                        Thread.currentThread().interrupt();
                        break albums;
                    }
                }
            }
        } catch (IOException e) {
            System.out.println("抓取文件过程中出错，可能是请求过于频繁服务器拒绝响应");
            e.printStackTrace();
        }
        if (!filePath.isEmpty()) {
            SavePngJoup(filePath, fileId);
        }
    } finally {
        System.out.println("运行结束");
    }
}

/** Returns the first path segment of {@code href} below {@code baseUrl}, or "" if there is none. */
private static String extractAlbumId(String href, String baseUrl) {
    if (href == null || !href.startsWith(baseUrl)) {
        return "";
    }
    String rest = href.substring(baseUrl.length());
    int slash = rest.indexOf('/');
    return slash >= 0 ? rest.substring(0, slash) : rest;
}

//使用这个方法前请自己在D盘建立test文件夹，否则会报错
public static void SavePngJoup(ArrayList<String> filePath,ArrayList<String> fileId){
   ByteArrayOutputStream byteOutputStream = null;
   FileOutputStream fileOutputStream = null;
   DataInputStream dataInputStream = null;
   //遍历保存图片
for (int index = 0 ; index < filePath.size() ; index++){
         String url = null;
         try {
            url = new String(filePath.get(index));//读取地址
//下面的网络请求设置服务端，否则，请求次数多了服务器拒绝响应
Connection connection = Jsoup.connect(url).userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6").referrer("https://www.mzitu.com/");
            Connection.Response response = connection.method(Connection.Method.GET).ignoreContentType(true).timeout(3*1000).execute();
            BufferedInputStream bufferedInputStream = response.bodyStream();
            try {
Thread.sleep(500);//模拟人操作，人操作总会耗时，具体延迟多少看个人自己测试了。
//另外，这里等待时间可以是500左右的随机数，这样就更真实模拟人的点击
} catch (InterruptedException e) {
e.printStackTrace();
}
            dataInputStream = new DataInputStream(bufferedInputStream);
            File file = new File("D:/test/"+fileId.get(index)+".jpg");
            fileOutputStream = new FileOutputStream(file);
            byteOutputStream = new ByteArrayOutputStream();
            byte[] buffer = new byte;
            int length;
            /*写入字节*/
while ((length = dataInputStream.read(buffer))>0){
               byteOutputStream.write(buffer,0,length);
            }
            byte[] context = byteOutputStream.toByteArray();
            fileOutputStream.write(context);
            System.out.println("保存了"+file.getName());
         } catch (MalformedURLException e) {
            System.out.println("URL转换错误");
e.printStackTrace();
         } catch (FileNotFoundException e) {
            System.out.println("无法创建文件");
e.printStackTrace();
         } catch (IOException e) {
            System.out.println("文件传输出现错误");
e.printStackTrace();
         }

   }
/*关闭流*/
try {
         fileOutputStream.close();
         byteOutputStream.close();
         dataInputStream.close();
   } catch (IOException e) {
         System.out.println("关闭流出现错误");
e.printStackTrace();
   }
}

最后贴上自己写的一个工具类

/**
 * Small string helpers used by the crawler to take URL paths apart.
 */
public class StringTools {

    /**
     * Splits {@code str} at every occurrence of the first character of {@code fenggefu}.
     *
     * <p>Empty pieces are kept, so {@code "a,,b,"} split on {@code ","} yields
     * {@code ["a", "", "b", ""]} and an input without any separator yields a one-element array.
     * Example: {@code seperateString("haha=haha,didi=didi", ",")} returns
     * {@code ["haha=haha", "didi=didi"]}.
     *
     * @param str      the string to split
     * @param fenggefu the separator; only its first character is used. An empty separator
     *                 returns {@code str} unsplit.
     * @return the pieces in order of appearance, never empty
     * @throws NullPointerException if either argument is {@code null}
     */
    public static String[] seperateString(String str, String fenggefu) {
        if (str == null || fenggefu == null) {
            throw new NullPointerException("str and fenggefu must not be null");
        }
        if (fenggefu.isEmpty()) {
            return new String[] {str};
        }

        char separator = fenggefu.charAt(0);
        int len = str.length();

        // First pass: count the pieces so the result array can be sized exactly.
        int numberOfParts = 1;
        for (int i = 0; i < len; i++) {
            if (str.charAt(i) == separator) {
                numberOfParts++;
            }
        }

        // Second pass: cut out each piece between two separators.
        String[] properties = new String[numberOfParts];
        int slot = 0;
        int start = 0;
        for (int i = 0; i < len; i++) {
            if (str.charAt(i) == separator) {
                properties[slot++] = str.substring(start, i);
                start = i + 1;
            }
        }
        properties[slot] = str.substring(start);
        return properties;
    }

    /**
     * Returns the piece after the last separator, e.g. {@code "45"} for
     * {@code getLastStringBySeperate("https://www.mzitu.com/123456/45", "/")}.
     *
     * @param str      the string to split
     * @param fenggefu the separator; only its first character is used
     * @return the last piece; the whole string when it contains no separator
     * @throws NullPointerException if either argument is {@code null}
     */
    public static String getLastStringBySeperate(String str, String fenggefu) {
        String[] reStrings = seperateString(str, fenggefu);
        return reStrings[reStrings.length - 1];
    }
}

这个工具类可以把字符串分为一个数组。在本程序中只是用来获取"/"分隔的最后一组数字。有以上代码，只要在主方法调用就可以爬图了。图片很多，爬取需要时间，请耐心等待。

xccxvb 发表于 2020-7-27 16:25

大兵马元帅发表于 2020-7-27 16:09
Python能不能不要这么骄傲了，你会的别人也会，你不会的别人也会。

可是python不需要写这么多代码哦，不同语言各有各的优点吧，有些东西也是只有python才比较擅长的。身为一个程序员要对自己擅长的语言有足够的信心！

大兵马元帅 发表于 2020-7-27 16:09

Python能不能不要这么骄傲了，你会的别人也会，你不会的别人也会。

MySeeker 发表于 2020-7-27 16:13

人均绅士{:301_997:}

qujf 发表于 2020-7-27 16:32

xccxvb 发表于 2020-7-27 16:25
可是python不需要写这么多代码哦，不同语言各有各的优点吧，有些东西也是只有python才比较擅长的。身为一 ...

嗯，我觉得你说的对。大部分功能每一种后台语言都可以做，就是看谁的效率和性能了。

meteornk 发表于 2020-7-27 16:33

坚持学习java开发

红烧大白菜 发表于 2020-7-27 16:40

还是喜欢直接打包，简单。

daymissed 发表于 2020-7-27 16:56

好东西，收藏备用。感谢分享

比鸽爱鸟A 发表于 2020-7-27 16:58

666,这个好，感谢楼主

我有三个艾琳 发表于 2020-7-27 17:19

当年学python就为了爬网址:lol

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

java爬妹子图