import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Main {
/** HTTP client used for requests; reassigned by {@code changeProxy()} when a new proxy is selected. */
CloseableHttpClient httpClient;
// NOTE(review): bookId is not referenced in the visible part of this file; presumably read elsewhere - confirm.
static int bookId = 496;
/** Proxy IP address -> proxy port. */
Map<String, Integer> proxyMap;
/** Proxy IP addresses in the order they are tried; each entry is the key used to look up its port in {@code proxyMap}. */
List<String> ipList;
/** Index into {@code ipList} of the next proxy to switch to. */
int i = 0;
/**
 * Entry point: builds a crawler and processes a fixed list of tags,
 * passing 10 as the per-tag page limit.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final int maxPagesPerTag = 10;
    Main crawler = new Main();

    // Alternative: take the tag list from the site instead of hard-coding it.
    // List<String> tags = crawler.getTagList();
    List<String> tags = new LinkedList<String>();

    // Tags that are currently disabled; uncomment to include them in a run.
    // tags.add("经典");
    // tags.add("日本文学");
    // tags.add("散文");
    // tags.add("中国文学");
    // tags.add("算法");
    // tags.add("童话");
    // tags.add("外国文学");
    // tags.add("文学");
    // tags.add("小说");
    // tags.add("漫画");
    // tags.add("诗词");
    // tags.add("心理学");

    // Tags processed by this run.
    tags.add("摄影");
    tags.add("理财");
    tags.add("经济学");

    crawler.pullAndWrite(tags, maxPagesPerTag);
}
/**
 * Creates a crawler backed by the default HTTP client and fills in the proxy table.
 */
public Main() {
    // To start out behind a proxy instead, build the client like this:
    // HttpHost proxy = new HttpHost("122.225.106.35",80);
    // httpClient = HttpClients.custom().setProxy(proxy).build();
    this.httpClient = HttpClients.createDefault();
    this.setProxyMap();
}
/**
 * Replaces the proxy table and the proxy order list with fresh collections
 * containing the single hard-coded proxy.
 */
public void setProxyMap() {
    final String ip = "211.68.122.171";
    final int port = 80;

    proxyMap = new HashMap<String, Integer>();
    ipList = new LinkedList<String>();

    // The IP is both the map key (ip -> port) and the entry in the ordered list.
    proxyMap.put(ip, port);
    ipList.add(ip);
}
/**
 * Downloads the tag index page and extracts the text that follows each
 * {@code class="tag">} marker in its source.
 *
 * @return the extracted tag names in page order; an empty list when the page
 *         could not be downloaded (the I/O error is printed, not rethrown)
 */
public List<String> getTagList() {
    HttpGet getTag = new HttpGet("http://book.douban.com/tag/");
    getTag.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30");

    List<String> resultTagList = new ArrayList<String>();

    String tagPageCode; // page source
    // try-with-resources closes the response exactly once and never touches a
    // null reference when execute() itself fails.
    try (CloseableHttpResponse tagPageResponse = httpClient.execute(getTag)) {
        // "UTF-8" is only the fallback used when the response declares no charset.
        tagPageCode = EntityUtils.toString(tagPageResponse.getEntity(), "UTF-8");
    } catch (IOException e) {
        e.printStackTrace();
        // Without a page there is nothing to match against; report "no tags"
        // instead of failing later with a NullPointerException.
        return resultTagList;
    }

    // A trailing reluctant group "(.*?)" can only ever match the empty string, so
    // the capture is bounded by the next '<'.
    // NOTE(review): assumes the tag text is immediately followed by markup - confirm against the live page.
    Pattern p = Pattern.compile("class=\"tag\">([^<]*)<");
    Matcher m = p.matcher(tagPageCode);
    while (m.find()) {
        resultTagList.add(m.group(1));
    }
    return resultTagList;
}
/**
 * Builds a GetBookInfoThread for every book link matched on a result page, starts them all,
 * waits for them to finish, and finally starts a WriteBookInfoToFile thread that writes to
 * "/home/geekgao/book/" + tag + ".xml".
 *
 * NOTE(review): the body of this method is truncated in this copy of the file - several regex
 * literals are cut off and the declarations/loop headers for m, rootElement, findCount,
 * nowPageNum, threadList and tag are missing. Restore it from version control before changing it.
 *
 * @param tagList the book categories (tags) to crawl
 * @param maxPageNum the maximum number of pages to fetch for each category
 */
public void pullAndWrite(List tagList,int maxPageNum) {
Pattern bookAddressRegex = Pattern.compile("href=\"(.*?)\" class=\"title\" target=\"_blank\">(.*?)"); // regex that extracts the address of an individual book page
Pattern bookAuthorRegex = Pattern.compile("(?s) 作者:.*?>(.*?)");// matches the author
Pattern bookPublishRegex = Pattern.compile("出版社: (.*?)
");
Pattern bookIsbnRegex = Pattern.compile("ISBN: (.*?)
");
Pattern bookImgRegex = Pattern.compile("
threadList = new LinkedList();
while (m.find()) {
threadList.add(new GetBookInfoThread(httpClient, m.group(1), m.group(2), rootElement, bookAuthorRegex, bookPublishRegex, bookIsbnRegex,bookImgRegex));
findCount++;
}
// Nothing found means every book of this category has been processed, so stop searching this category
if (findCount == 0) {
break;
}
// Start every worker collected for this page ...
for (Thread thread:threadList) {
thread.start();
}
// ... then wait for each of them before advancing the page counter
for (Thread thread:threadList) {
try {
thread.join();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
nowPageNum++;
}
// Write only after a whole category has been crawled
new WriteBookInfoToFile(rootElement,"/home/geekgao/book/" + tag + ".xml").start(); // write the file on a separate thread
}
}
/**
 * Replaces {@code httpClient} with a new client routed through the next proxy in {@code ipList}
 * and advances the proxy index. Prints a message and terminates the JVM when every proxy has
 * already been used.
 */
private void changeProxy() {
    if (i >= ipList.size()) {
        System.out.println("代理用完了,退出");
        System.exit(0);
    }
    // Explicit casts keep these lookups compiling whether or not the collection
    // fields are declared with type parameters.
    String ip = (String) ipList.get(i++);
    int port = (Integer) proxyMap.get(ip); // looked up once, reused for the client and the log line
    // NOTE(review): the previous client is not closed here; confirm no worker thread
    // still uses it before adding a close() call.
    httpClient = HttpClients.custom().setProxy(new HttpHost(ip, port)).build();
    System.out.println("换代理啦,使用代理:" + ip + ",端口:" + port);
}
}