作为一位Java爬虫的初学者,分享一下自己的心得。 所用到的jar包
jsoup-1.7.3.jar 个人认为爬虫的实现机制: 获取Document对象—>获取节点—>输出或者持久化
获取页面的图片地址
获取Document对象—>获取Img元素—>输出地址
1 package com.cn.basic;
2
3 import java.io.IOException;
4 import org.jsoup.Jsoup;
5 import org.jsoup.nodes.Document;
6 import org.jsoup.nodes.Element;
7 import org.jsoup.select.Elements;
8
9 public class ImageDemo1 {
10
11 public static void Get_Url(String htmlUrl, String path) {
12
13 try {
14 Document doc = Jsoup.connect(htmlUrl).get();
15
16 Element body = doc.body();
17 Elements elements = body.select("img");
18
19 String src = "";
20 for (Element element : elements) {
21
22 src = element.attr("src");
23
24 System.out.println(path + src);
25
26 }
27
28 System.out.println("elements-size: " + elements.size());
29
30 } catch (IOException e) {
31 e.printStackTrace();
32 }
33
34 }
35
36 public static void main(String[] args) {
37
38 String url = "http://pic.netbian.com/4kkatongdongman/index_2.html";
39 String path = "http://pic.netbian.com";
40 Get_Url(url, path);
41
42 }
43
44 }
将图片写入本地
获取Document对象—>获取Img元素—>将图片保存本地
1 package com.cn.basic;
2
3 import java.io.ByteArrayOutputStream;
4 import java.io.File;
5 import java.io.FileOutputStream;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.net.HttpURLConnection;
9 import java.net.URL;
10 import java.util.Date;
11
12 import org.jsoup.Jsoup;
13 import org.jsoup.nodes.Document;
14 import org.jsoup.nodes.Element;
15 import org.jsoup.select.Elements;
16
17 public class ImageDemo2 {
18
19 public static void saveImage(String htmlUrl, String path) {
20
21 try {
22 Document doc = Jsoup.connect(htmlUrl).get();
23 Element body = doc.body();
24 Elements elements = body.select("img");
25
26 String outputFilePath="E:/pythonTest/javaPython/imgs/";
27 String src = "";
28
29 HttpURLConnection conn = null;
30 InputStream inStream = null;
31 byte[] data = null;
32 String filePath = null;
33 FileOutputStream outStream = null;
34
35 Long startTime=new Date().getTime();
36
37 for (Element element : elements) {
38
39 src = element.attr("src");
40
41 System.out.println(path + src);
42 // new一个URL对象
43
44 if (!src.contains(".jpg")) {
45 continue;
46 }
47
48 URL url = new URL(path + src);
49 // 打开链接
50 conn = (HttpURLConnection) url.openConnection();
51 // 设置请求方式为"GET"
52 conn.setRequestMethod("GET");
53 // 超时响应时间为5秒
54 conn.setConnectTimeout(5 * 1000);
55 // 通过输入流获取图片数据
56 inStream = conn.getInputStream();
57 // 得到图片的二进制数据,以二进制封装得到数据,具有通用性
58 data = readInputStream(inStream);
59 // new一个文件对象用来保存图片,默认保存当前工程根目录
60 filePath = outputFilePath + System.currentTimeMillis() + ".jpg";
61 // 创建输出流
62 outStream = new FileOutputStream(new File(filePath));
63 // 写入数据
64 outStream.write(data);
65 // 关闭输出流
66 outStream.close();
67
68 }
69 System.out.println(elements.size());
70 System.out.println("读写速度:"+(new Date().getTime()-startTime)+"毫秒");
71
72
73 } catch (IOException e) {
74 e.printStackTrace();
75 } catch (Exception e) {
76 e.printStackTrace();
77 }
78
79 }
80
81 public static byte[] readInputStream(InputStream inStream) throws Exception {
82 ByteArrayOutputStream outStream = new ByteArrayOutputStream();
83 // 创建一个Buffer字符串
84 byte[] buffer = new byte[1024];
85 // 每次读取的字符串长度,如果为-1,代表全部读取完毕
86 int len = 0;
87 // 使用一个输入流从buffer里把数据读取出来
88 while ((len = inStream.read(buffer)) != -1) {
89 // 用输出流往buffer里写入数据,中间参数代表从哪个位置开始读,len代表读取的长度
90 outStream.write(buffer, 0, len);
91 }
92 // 关闭输入流
93 inStream.close();
94 // 把outStream里的数据写入内存
95 return outStream.toByteArray();
96 }
97
98 public static void main(String[] args) {
99 String url = "http://pic.netbian.com/4kkatongdongman/index_2.html";
100 String path = "http://pic.netbian.com";
101 saveImage(url, path);
102 }
103
104 }