JSoup handles most HTML parsing needs in a crawler. For search-style pages such as Baidu or the corpus query page below, which submit their parameters through forms, you also need Fiddler to capture the real request. The steps:
Open http://www.cncorpus.org/CnCindex.aspx in Internet Explorer and capture the traffic with Fiddler.
Double-click the captured session in Fiddler to see the complete request.
Then append the captured request parameters to http://www.cncorpus.org/CnCindex.aspx,
giving:
http://www.cncorpus.org/CnCindex.aspx?__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=%2FwEPDwUKMTk4MDQ0MTE5OA9kFgICAw9kFgQCKQ8PFgIeB1Zpc2libGVnZBYIAgMPDxYCHgRUZXh0BTrnrKwx5YiwNTAw5p2h77yM5YWx5p%2Bl6K%2Bi5YiwNTI3MjjmnaHnrKblkIjopoHmsYLnmoTkvovlj6UhZGQCBQ8PFgIfAGhkZAIHDw8WAh8AaGRkAg0PDxYCHwBnZGQCLw8PFgIfAGdkFgoCAQ8PFgIfAGhkZAIDDw8WAh8AaGRkAgkPDxYCHwEFATFkZAILDw8WAh8BBQMxMDZkZAINDw8WAh8BBQU1MjcyOGRkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYKBQtSQmluZGV4d29yZAUKUkJsaWtlbW9kZQUKUkJsaWtlbW9kZQUOUkJmdWxsdGV4dG1vZGUFDlJCZnVsbHRleHRtb2RlBQxSYWRpb0J1dHRvbjMFDFJhZGlvQnV0dG9uMwUMUmFkaW9CdXR0b240BQ5DaGVja0JveENodWNodQUQQ2hlY2tCb3hLV0lDbW9kZeDFB%2FOXKuors7kNSBQvXV5bn9EPHGNvJgT94fUsjIhu&__VIEWSTATEGENERATOR=3A0BE18D&__EVENTVALIDATION=%2FwEWFQKNm9KcBQLYiuv%2FCwLzuO7zDQL3uO7zDQLV%2BYmkCgLZ%2BYmkCgKM54rGBgK8u9naBwKJlM7DBwKAg8rcDgKWzvT1CAKWzuCuBwK2q5qHDgK%2FxfDTAQLxqL%2BhAgLCpJSTBQKKn9X3AwKLlOLCBgLc%2F9LTBQL3t9jyBALZu%2BPjB6rMBlDgd9II8LdS4y%2BzUaXaUcHAjVptZHdcvx89wEPp&TextBoxCCkeywords=%E6%88%91&DropDownListPsize=500&Button1=%E6%A3%80++%E7%B4%A2&1=RBindexword&2=RadioButton4&txtTopage=
The next-page link can be captured the same way:
http://www.cncorpus.org/CnCindex.aspx?__EVENTTARGET=LBnextpage&__EVENTARGUMENT=&__VIEWSTATE=%2FwEPDwUKMTk4MDQ0MTE5OA9kFgICAw9kFgQCKQ8PFgIeB1Zpc2libGVnZBYIAgMPDxYCHgRUZXh0BTrnrKwx5YiwNTAw5p2h77yM5YWx5p%2Bl6K%2Bi5YiwNTI3MjjmnaHnrKblkIjopoHmsYLnmoTkvovlj6UhZGQCBQ8PFgIfAGhkZAIHDw8WAh8AaGRkAg0PDxYCHwBnZGQCLw8PFgIfAGdkFgoCAQ8PFgIfAGhkZAIDDw8WAh8AaGRkAgkPDxYCHwEFATFkZAILDw8WAh8BBQMxMDZkZAINDw8WAh8BBQU1MjcyOGRkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYKBQtSQmluZGV4d29yZAUKUkJsaWtlbW9kZQUKUkJsaWtlbW9kZQUOUkJmdWxsdGV4dG1vZGUFDlJCZnVsbHRleHRtb2RlBQxSYWRpb0J1dHRvbjMFDFJhZGlvQnV0dG9uMwUMUmFkaW9CdXR0b240BQ5DaGVja0JveENodWNodQUQQ2hlY2tCb3hLV0lDbW9kZeDFB%2FOXKuors7kNSBQvXV5bn9EPHGNvJgT94fUsjIhu&__VIEWSTATEGENERATOR=3A0BE18D&__EVENTVALIDATION=%2FwEWFQKNm9KcBQLYiuv%2FCwLzuO7zDQL3uO7zDQLV%2BYmkCgLZ%2BYmkCgKM54rGBgK8u9naBwKJlM7DBwKAg8rcDgKWzvT1CAKWzuCuBwK2q5qHDgK%2FxfDTAQLxqL%2BhAgLCpJSTBQKKn9X3AwKLlOLCBgLc%2F9LTBQL3t9jyBALZu%2BPjB6rMBlDgd9II8LdS4y%2BzUaXaUcHAjVptZHdcvx89wEPp&TextBoxCCkeywords=%E6%88%91&DropDownListPsize=500&1=RBindexword&2=RadioButton4&txtTopage=
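As a quick sanity check on a captured link, you can URL-decode individual parameters. A minimal sketch; the two encoded values below are taken from the query strings above:

import java.net.URLDecoder;

public class DecodeCheck {
    public static void main(String[] args) throws Exception {
        // TextBoxCCkeywords=%E6%88%91 is the search keyword, "我"
        System.out.println(URLDecoder.decode("%E6%88%91", "UTF-8"));
        // Button1=%E6%A3%80++%E7%B4%A2 is the submit button text, "检  索" ('+' decodes to a space)
        System.out.println(URLDecoder.decode("%E6%A3%80++%E7%B4%A2", "UTF-8"));
    }
}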
Note that the next-page request must carry the cookie set by the query page (the ASP.NET session cookie); otherwise the server will not accept the paging postback. The cookie is fetched like this:
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
import java.net.CookieStore;
import java.net.HttpCookie;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.util.List;

public static String getCookie(final String Url) {
    StringBuffer sb = new StringBuffer();
    try {
        // Install a cookie manager so the JVM records cookies set by the server
        CookieManager manager = new CookieManager();
        manager.setCookiePolicy(CookiePolicy.ACCEPT_ORIGINAL_SERVER);
        CookieHandler.setDefault(manager);
        URL url = new URL(Url);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Reading the headers forces the request, which populates the cookie store
        conn.getHeaderFields();
        CookieStore store = manager.getCookieStore();
        List<HttpCookie> lCookies = store.getCookies();
        for (HttpCookie cookie : lCookies) {
            sb.append(URLDecoder.decode(cookie.getValue(), "UTF-8"));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return sb.toString();
}
To use it, call getCookie(findUrl), where findUrl is the URL of the query page. Because CookieHandler.setDefault installs the cookie manager JVM-wide, every URLConnection opened afterwards sends the session cookie automatically, which is what lets the next-page request succeed.
The overall approach is to save each response as a local HTML file and then parse it with JSoup; the required library is jsoup-1.8.1.jar.
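For reference, parsing one saved page with JSoup looks like this. A minimal sketch, assuming a page was already saved to ./htmlfind/我.html; the span selector is the one the full program below uses for result rows:

package com.star.crawlerweb;

import java.io.File;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class ParseSketch {
    public static void main(String[] args) throws Exception {
        // Parse a locally saved result page and print each result row
        Document doc = Jsoup.parse(new File("./htmlfind/我.html"), "UTF-8", "");
        Elements rows = doc.select("span[style=display:inline-block;font-family:宋体;font-size:11pt;width:1080px;]");
        for (Element e : rows) {
            System.out.println(e.text());
        }
    }
}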
Saving and post-processing rely on the two utility classes below.
package com.star.crawlerweb;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class FileUtil {
    /* File utility class */
    public static final String SUFFIX = ".dat";

    @SuppressWarnings({ "unchecked" })
    public static void extractedOther(String sourcePath, String resultPath)
            throws FileNotFoundException, IOException {
        StringBuilder builder = readSource(sourcePath);
        // "/tag" matches a bare POS tag; "word/tag" matches a tagged word
        String pattenAttr = "\\/+[a-zA-Z]+";
        String pattenall = "([\u4e00-\u9fa5]+)\\/+[a-zA-Z]+";
        Map<String, Integer> mapattr = countWord(builder, pattenAttr);
        Map<String, Integer> mapall = countWord(builder, pattenall);
        FileUtil.writefile("=========分割线===========" + "\n", resultPath);
        // Write POS tags that occur at least 5 times
        Iterator<?> iterattr = mapattr.entrySet().iterator();
        while (iterattr.hasNext()) {
            Map.Entry<String, Integer> entry = (Map.Entry<String, Integer>) iterattr.next();
            Object key = entry.getKey();
            Object val = entry.getValue();
            if (Integer.parseInt(val.toString()) >= 5) {
                FileUtil.writefile(key.toString().replace("/", "") + " " + val + "\n", resultPath);
            }
        }
        FileUtil.writefile("=========分割线===========" + "\n", resultPath);
        // Write tagged words that occur at least 5 times
        Iterator<?> iterall = mapall.entrySet().iterator();
        while (iterall.hasNext()) {
            Map.Entry<String, Integer> entry = (Map.Entry<String, Integer>) iterall.next();
            Object key = entry.getKey();
            Object val = entry.getValue();
            if (Integer.parseInt(val.toString()) >= 5) {
                FileUtil.writefile(key.toString().replaceAll("/", " ") + " " + val + "\n", resultPath);
            }
        }
    }
    public static final int BUFSIZE = 1024 * 8;

    public static void mergeFiles(String outFile, String[] files) {
        FileChannel outChannel = null;
        System.out.println("Merge " + Arrays.toString(files) + " into " + outFile);
        try {
            outChannel = new FileOutputStream(outFile).getChannel();
            for (String f : files) {
                @SuppressWarnings("resource")
                FileChannel fc = new FileInputStream(f).getChannel();
                ByteBuffer bb = ByteBuffer.allocate(BUFSIZE);
                while (fc.read(bb) != -1) {
                    bb.flip();
                    outChannel.write(bb);
                    bb.clear();
                }
                fc.close();
            }
            System.out.println("合并成功 ");
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            try {
                if (outChannel != null) {
                    outChannel.close();
                }
            } catch (IOException ignore) {
            }
        }
    }
    public static void resultCut() throws Exception {
        String path = "resultAll.txt";
        File file = new File(path);
        if (file.exists() && file.isFile()) {
            file.delete();
        }
        // The crawler appends its raw output to resultcrawler.html (see WordSpider below)
        bigFileCut("resultcrawler.html");
        System.out.println("去重结果保存在resultAll.txt中" + "\n");
        FileUtil.deleteDirectory("htmlfind");
        FileUtil.deleteDirectory("htmlnext");
        FileUtil.deleteHtml("./");
        @SuppressWarnings("resource")
        Scanner scan = new Scanner(System.in);
        System.out.println("是否统计词性出现次数?是:1 否:0\n");
        int flag = scan.nextInt();
        if (flag == 1) {
            FileUtil.extractedOther("resultAll.txt", "resultcount.txt");
            System.out.println("词数统计成功,结果保存在" + "resultcount.txt" + "中");
        }
    }
    private static void bigFileCut(String path) throws Exception, IOException,
            UnsupportedEncodingException {
        Set<String> set = new HashSet<String>();
        long maxsize = 1024 * 1024 * 50; // split files larger than 50 MB
        long size = 1024 * 1024 * 10;    // ...into 10 MB chunks
        File file = new File(path);
        long fileLength = file.length();
        if (size <= 0) {
            size = fileLength / 2;
        }
        int num = (fileLength % size != 0) ? (int) (fileLength / size + 1)
                : (int) (fileLength / size);
        if (file.length() >= maxsize) {
            FileUtil.divide(path, size);
            for (int m = 0; m < num; m++) {
                String pathdived = "./htmlfind/text" + m + SUFFIX;
                System.out.println("开始提取第" + (m + 1) + "个文件……");
                set.addAll(FileUtil.RemoveDuplicate(pathdived));
            }
        } else {
            set.addAll(FileUtil.RemoveDuplicate(path));
        }
        // readfile() URL-encoded each line, so decode before writing the result
        for (String i : set) {
            System.out.println("正在写入" + URLDecoder.decode(i, "utf-8") + "\n");
            FileUtil.writefile(URLDecoder.decode(i, "utf-8") + "\n", "resultAll.txt");
        }
    }
    public static void deleteHtml(String path) {
        File file = new File(path);
        File temp = null;
        File[] filelist = file.listFiles();
        for (int i = 0; i < filelist.length; i++) {
            temp = filelist[i];
            if (temp.getName().endsWith("html")) {
                temp.delete();
            }
        }
    }
    @SuppressWarnings({ "unchecked" })
    public static void extractedWord(String first, String sourcePath,
            String resultPath) throws IOException {
        StringBuilder builder = readSource(sourcePath);
        String pattenWord = "([\u4e00-\u9fa5]+)";
        Map<String, Integer> mapword = countWord(builder, pattenWord);
        Iterator<?> iterword = mapword.entrySet().iterator();
        while (iterword.hasNext()) {
            Map.Entry<String, Integer> entry = (Map.Entry<String, Integer>) iterword.next();
            Object key = entry.getKey();
            Object val = entry.getValue();
            // Record co-occurring words seen at least 5 times, skipping the query word itself
            if (Integer.parseInt(val.toString()) >= 5) {
                if (isKey(first, pattenWord, key) == false) {
                    FileUtil.writefile(first + "@" + key + ": " + val + "\n", resultPath);
                }
            }
        }
    }

    public static StringBuilder readSource(String sourcePath)
            throws FileNotFoundException, IOException {
        File file = new File(sourcePath);
        FileReader fileReader = new FileReader(file);
        BufferedReader reader = new BufferedReader(fileReader);
        StringBuilder builder = new StringBuilder();
        String line = "";
        while ((line = reader.readLine()) != null) {
            builder.append(line);
        }
        reader.close();
        return builder;
    }

    public static boolean isKey(String first, String pattenWord, Object key) {
        Pattern pattern = Pattern.compile(pattenWord);
        Matcher matcher = pattern.matcher(key.toString());
        Matcher matchers = pattern.matcher(first.toString());
        while (matcher.find() && matchers.find()) {
            String keymatch = matcher.group();
            String firstmatch = matchers.group();
            if (keymatch.equals(firstmatch)) {
                return true;
            }
        }
        return false;
    }
    public static Map<String, Integer> countWord(StringBuilder builder, String patten) {
        Pattern pattern = Pattern.compile(patten);
        String content = builder.toString();
        Matcher matcher = pattern.matcher(content);
        Map<String, Integer> map = new HashMap<String, Integer>();
        String word = "";
        Integer times = 0;
        while (matcher.find()) {
            word = matcher.group();
            if (map.containsKey(word)) {
                times = map.get(word);
                map.put(word, times + 1);
            } else {
                map.put(word, 1);
            }
        }
        return map;
    }

    public static Set<String> RemoveDuplicate(String path) throws IOException,
            UnsupportedEncodingException {
        // A HashSet discards duplicate lines on insertion
        Set<String> set = new HashSet<String>();
        set.addAll(FileUtil.readfile(path));
        return set;
    }
    public static void divide(String name, long size) throws Exception {
        File file = new File(name);
        if (!file.exists() || (!file.isFile())) {
            throw new Exception("指定文件不存在!");
        }
        long fileLength = file.length();
        if (size <= 0) {
            size = fileLength / 2;
        }
        int num = (fileLength % size != 0) ? (int) (fileLength / size + 1)
                : (int) (fileLength / size);
        String[] fileNames = new String[num];
        FileInputStream in = new FileInputStream(file);
        long end = 0;
        int begin = 0;
        for (int i = 0; i < num; i++) {
            File outFile = new File("./htmlfind", "text" + i + SUFFIX);
            FileOutputStream out = new FileOutputStream(outFile);
            end += size;
            end = (end > fileLength) ? fileLength : end;
            // Byte-by-byte copy; slow but simple
            for (; begin < end; begin++) {
                out.write(in.read());
            }
            out.close();
            fileNames[i] = outFile.getAbsolutePath();
            System.out.println("第" + (i + 1) + "个子文件生成……");
        }
        in.close();
    }

    public static List<String> readfile(String path) throws IOException {
        List<String> list = new ArrayList<String>();
        File file = new File(path);
        FileInputStream s = new FileInputStream(file);
        BufferedReader reader = new BufferedReader(new InputStreamReader(s, "utf-8"),
                5 * 1024 * 1024);
        String tempString = null;
        // URL-encode each line so it is stored safely; decoded later in bigFileCut()
        while ((tempString = reader.readLine()) != null) {
            String word = java.net.URLEncoder.encode(tempString, "utf-8");
            list.add(word);
        }
        reader.close();
        return list;
    }
    public static void writefile(String m, String path) {
        try {
            File file = new File(path);
            if (!file.exists()) {
                file.createNewFile();
            }
            // Append to the file itself; the original used file.getName(), which
            // resolves against the working directory and ignores parent folders in path
            FileWriter fileWritter = new FileWriter(file, true);
            BufferedWriter bufferWritter = new BufferedWriter(fileWritter);
            bufferWritter.write(m);
            bufferWritter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static boolean createDir(String destDirName) {
        File dir = new File(destDirName);
        if (dir.exists()) {
            return false;
        }
        return dir.mkdirs();
    }
    public static boolean deleteDirectory(String sPath) {
        if (!sPath.endsWith(File.separator)) {
            sPath = sPath + File.separator;
        }
        File dirFile = new File(sPath);
        if (!dirFile.exists() || !dirFile.isDirectory()) {
            return false;
        }
        boolean flag = true;
        // Recursively delete children, then the directory itself
        File[] files = dirFile.listFiles();
        for (int i = 0; i < files.length; i++) {
            if (files[i].isFile()) {
                flag = deleteFile(files[i].getAbsolutePath());
            } else {
                flag = deleteDirectory(files[i].getAbsolutePath());
            }
            if (!flag) {
                break;
            }
        }
        if (!flag) {
            return false;
        }
        return dirFile.delete();
    }

    public static boolean deleteFile(String sPath) {
        boolean flag = false;
        File file = new File(sPath);
        if (file.isFile() && file.exists()) {
            file.delete();
            flag = true;
        }
        return flag;
    }

    public static void clearFile() {
        // Reset the working directories used for saved pages
        deleteDirectory("htmlfind");
        deleteDirectory("htmlnext");
        createDir("htmlfind");
        createDir("htmlnext");
    }
}
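For orientation, here is a sketch of how these utilities chain together; the sample line is hypothetical, while the file names are the ones this project actually uses:

package com.star.crawlerweb;

public class FileUtilDemo {
    public static void main(String[] args) throws Exception {
        FileUtil.clearFile();                                         // recreate ./htmlfind and ./htmlnext
        FileUtil.writefile("我/r 去/v 学校/n\n", "resultcrawler.html"); // hypothetical crawled line
        FileUtil.resultCut();                                         // dedup into resultAll.txt, optional POS counts
    }
}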
package com.star.crawlerweb;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
/* Fetch an HTML page and save it locally */
public class HtmlUtil {
    public static void urlToHtm(String word, String findurl, String path) {
        URL url = null;
        try {
            url = new URL(findurl);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        String charset = "utf-8";
        int sec_cont = 1000;
        try {
            URLConnection url_con = url.openConnection();
            url_con.setDoOutput(true);
            url_con.setReadTimeout(10 * sec_cont); // 10-second read timeout
            // Pretend to be a browser so the server serves the normal page
            url_con.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");
            InputStream htm_in = url_con.getInputStream();
            String htm_str = InputStream2String(htm_in, charset);
            saveHtml(path, htm_str);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void saveHtml(String filepath, String str) {
        try {
            OutputStreamWriter outs = new OutputStreamWriter(
                    new FileOutputStream(filepath, true), "utf-8");
            outs.write(str);
            outs.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static String InputStream2String(InputStream in_st, String charset)
            throws IOException {
        BufferedReader buff = new BufferedReader(new InputStreamReader(in_st, charset));
        StringBuffer res = new StringBuffer();
        String line = "";
        while ((line = buff.readLine()) != null) {
            res.append(line);
        }
        return res.toString();
    }
}
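A quick usage sketch; the URL here is deliberately abbreviated and hypothetical, and in practice you must pass the full captured link with all the __VIEWSTATE parameters:

package com.star.crawlerweb;

public class HtmlUtilDemo {
    public static void main(String[] args) {
        // Hypothetical abbreviated URL; the real one is the full captured link above
        String findUrl = "http://www.cncorpus.org/CnCindex.aspx?TextBoxCCkeywords=%E6%88%91";
        HtmlUtil.urlToHtm("我", findUrl, "./htmlfind/我.html");
    }
}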
The main program is below. Because the site limits how often it can be hit within a short period, the crawl is driven by timers: an inner timer fetches the next result page of the current word every 20 s, and an outer timer moves on to the next query word at a longer fixed interval. The code:
package com.star.crawlerweb;
import java.io.File;
import java.io.IOException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
import java.net.CookieStore;
import java.net.HttpCookie;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/* JSoup tutorial (Chinese): http://www.cnblogs.com/xiaoMzjm/p/3899366.html */
public class WordSpider {
    public static String savehtml = ".html";
    public static int count = 1;

    public static void main(String[] args) throws IOException {
        FileUtil.clearFile();
        TimerTask task = new TimerTask() {
            int number = 0;
            // word.txt holds the query words, one per line
            List<String> result = FileUtil.readfile("word.txt");

            @Override
            public void run() {
                try {
                    synchronized (this) {
                        if (number <= result.size() - 1) {
                            getNativeWord(result.get(number));
                            number++;
                        } else {
                            // All words done: stop the timer and post-process the results
                            cancel();
                            FileUtil.resultCut();
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };
        Timer timer = new Timer();
        long delay = 0;
        long intervalPeriod = 1 * 20000 * 300; // 100 minutes per word
        System.out.println("每隔" + intervalPeriod / (1000 * 60) + "分钟爬取下一个词" + "\n");
        timer.scheduleAtFixedRate(task, delay, intervalPeriod);
    }
    public static String getCookie(final String Url) {
        StringBuffer sb = new StringBuffer();
        try {
            CookieManager manager = new CookieManager();
            manager.setCookiePolicy(CookiePolicy.ACCEPT_ORIGINAL_SERVER);
            CookieHandler.setDefault(manager);
            URL url = new URL(Url);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.getHeaderFields();
            CookieStore store = manager.getCookieStore();
            List<HttpCookie> lCookies = store.getCookies();
            for (HttpCookie cookie : lCookies) {
                sb.append(URLDecoder.decode(cookie.getValue(), "UTF-8"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return sb.toString();
    }
    private static synchronized void getNativeWord(final String word)
            throws IOException {
        // word arrives URL-encoded (see FileUtil.readfile), so decode it for display and paths
        final String decodedWord = java.net.URLDecoder.decode(word, "utf-8");
        final String resultPath = "resultcrawler.html";
        final String logpath = decodedWord + ".html";
        final String filefind = "./htmlfind/" + decodedWord + savehtml;
        // Query URL assembled from the Fiddler capture, with the keyword spliced in
        final String findUrl = "http://www.cncorpus.org/CnCindex.aspx?"
                + "__VIEWSTATE=%2FwEPDwUKMTk4MDQ0MTE5OA9kFgICAw9kFgQCKQ8PFgIeB1Zpc2libGVoZBYIAgMPDxYCHgRUZXh0BTrnrKwx5YiwMTAw5p2h77yM5YWx5p%2Bl6K%2Bi5YiwNTI3MjjmnaHnrKblkIjopoHmsYLnmoTkvovlj6UhZGQCBQ8PFgIfAGhkZAIHDw8WAh8AaGRkAg0PDxYCHwBnZGQCLw8PFgIfAGhkFgoCAQ8PFgIfAGhkZAIDDw8WAh8AaGRkAgkPDxYCHwEFATFkZAILDw8WAh8BBQM1MjhkZAINDw8WAh8BBQU1MjcyOGRkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYKBQtSQmluZGV4d29yZAUKUkJsaWtlbW9kZQUKUkJsaWtlbW9kZQUOUkJmdWxsdGV4dG1vZGUFDlJCZnVsbHRleHRtb2RlBQxSYWRpb0J1dHRvbjMFDFJhZGlvQnV0dG9uMwUMUmFkaW9CdXR0b240BQ5DaGVja0JveENodWNodQUQQ2hlY2tCb3hLV0lDbW9kZf9jlvtMb1%2FyXrpEQQLtIFyLoPLGND86N0hSq0CED%2Brk"
                + "&__VIEWSTATEGENERATOR=3A0BE18D"
                + "&__EVENTVALIDATION=%2FwEWDgK3wKfCCgLYiuv%2FCwLzuO7zDQL3uO7zDQLV%2BYmkCgLZ%2BYmkCgKM54rGBgK8u9naBwKJlM7DBwKAg8rcDgKWzvT1CAKWzuCuBwK2q5qHDgK%2FxfDTAXWmVvcYknI3MwjcfE48IiMijAq3WW044PF7g9pBhtfu"
                + "&TextBoxCCkeywords=" + word
                + "&DropDownListPsize=500&Button1=%E6%A3%80++%E7%B4%A2&1=RBindexword&2=RadioButton4";
        System.out.println("正在爬取" + "词:[" + decodedWord + "]" + "首页数据⋯⋯" + "\n");
        HtmlUtil.urlToHtm(decodedWord, findUrl, filefind);
        // Parse the saved first page; each result row is rendered in this span style
        File in = new File(filefind);
        Document doc = Jsoup.parse(in, "UTF-8", "");
        Elements spanPoint = doc
                .select("span[style=display:inline-block;font-family:宋体;font-size:11pt;width:1080px;]");
        final Elements pageNumber = doc.select("span[id=LabelPageCount]");
        final int number;
        if (pageNumber.text().toString().equals("")) {
            number = 0;
            System.out.println("对不起,关键词:[" + decodedWord + "]"
                    + "未被索引,请使用模糊检索方式查询" + "\n");
        } else {
            number = Integer.parseInt(pageNumber.text().toString());
            for (Element e : spanPoint) {
                FileUtil.writefile(e.text() + "\n", resultPath);
                FileUtil.writefile(e.text() + "\n", logpath);
            }
            System.out.println("词:[" + decodedWord + "]" + "首页数据爬取成功" + "\n");
        }
        // Next-page postback URL (__EVENTTARGET=LBnextpage), also from the capture
        final String nextUrl = "http://www.cncorpus.org/CnCindex.aspx?__EVENTTARGET=LBnextpage&__EVENTARGUMENT=&__VIEWSTATE=%2FwEPDwUKMTk4MDQ0MTE5OA9kFgICAw9kFgQCKQ8PFgIeB1Zpc2libGVnZBYIAgMPDxYCHgRUZXh0BTrnrKwx5YiwNTAw5p2h77yM5YWx5p%2Bl6K%2Bi5YiwNTI3MjjmnaHnrKblkIjopoHmsYLnmoTkvovlj6UhZGQCBQ8PFgIfAGhkZAIHDw8WAh8AaGRkAg0PDxYCHwBnZGQCLw8PFgIfAGdkFgoCAQ8PFgIfAGhkZAIDDw8WAh8AaGRkAgkPDxYCHwEFATFkZAILDw8WAh8BBQMxMDZkZAINDw8WAh8BBQU1MjcyOGRkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYKBQtSQmluZGV4d29yZAUKUkJsaWtlbW9kZQUKUkJsaWtlbW9kZQUOUkJmdWxsdGV4dG1vZGUFDlJCZnVsbHRleHRtb2RlBQxSYWRpb0J1dHRvbjMFDFJhZGlvQnV0dG9uMwUMUmFkaW9CdXR0b240BQ5DaGVja0JveENodWNodQUQQ2hlY2tCb3hLV0lDbW9kZeDFB%2FOXKuors7kNSBQvXV5bn9EPHGNvJgT94fUsjIhu&__VIEWSTATEGENERATOR=3A0BE18D&__EVENTVALIDATION=%2FwEWFQKNm9KcBQLYiuv%2FCwLzuO7zDQL3uO7zDQLV%2BYmkCgLZ%2BYmkCgKM54rGBgK8u9naBwKJlM7DBwKAg8rcDgKWzvT1CAKWzuCuBwK2q5qHDgK%2FxfDTAQLxqL%2BhAgLCpJSTBQKKn9X3AwKLlOLCBgLc%2F9LTBQL3t9jyBALZu%2BPjB6rMBlDgd9II8LdS4y%2BzUaXaUcHAjVptZHdcvx89wEPp"
                + "&TextBoxCCkeywords=" + word
                + "&DropDownListPsize=500&1=RBindexword&2=RadioButton4&txtTopage=";
        // Installing the cookie manager here makes every later request carry the session cookie
        getCookie(findUrl);
        TimerTask task = new TimerTask() {
            @Override
            public void run() {
                try {
                    ++count;
                    if (count <= number) {
                        System.out.println("正在爬取" + "词:[" + decodedWord + "]"
                                + "第" + count + "页数据⋯⋯" + "\n");
                        String filenext = "./htmlnext/" + decodedWord + count + savehtml;
                        HtmlUtil.urlToHtm(decodedWord, nextUrl, filenext);
                        File innext = new File(filenext);
                        Document docnext = Jsoup.parse(innext, "UTF-8", "");
                        Elements spannext = docnext
                                .select("span[style=display:inline-block;font-family:宋体;font-size:11pt;width:1080px;]");
                        System.out.println("词:[" + decodedWord + "]" + "第"
                                + count + "页数据爬取成功" + "\n");
                        for (Element e : spannext) {
                            FileUtil.writefile(e.text() + "\n", resultPath);
                            FileUtil.writefile(e.text() + "\n", logpath);
                        }
                    } else if (count > number) {
                        count = 1; // reset the page counter for the next word
                        if (number != 0) {
                            System.out.println("词:[" + decodedWord + "]"
                                    + "标注语料已经抓取完成,结果保存在resultcrawler.html中"
                                    + "\n");
                            FileUtil.extractedWord(decodedWord, logpath, "resultcount.txt");
                        }
                        cancel();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        };
        Timer timer = new Timer();
        long delay = 0;
        long intervalPeriod = 1 * 20000; // fetch the next page every 20 seconds
        timer.scheduleAtFixedRate(task, delay, intervalPeriod);
    }
}
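To drive the crawler, place a word.txt in the working directory with one query word per line (FileUtil.readfile URL-encodes each line before it is spliced into the request URL). A hypothetical example:

我
你
语料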
The fetched HTML pages are kept under the htmlfind and htmlnext folders while the crawl runs; the raw extracted rows accumulate in resultcrawler.html, the deduplicated output ends up in resultAll.txt, and optional word counts go to resultcount.txt.