多线程。没办法，貌似多数程序员都写过这玩意。
全部源码
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package spider.spider;
import java.net.URI;
import java.util.Vector;
import java.util.Enumeration;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import spider.define.Event;
import spider.define.Listener.EVENT;
/**
*
* @author 石卓林
*/
public class Spider implements spider.define.Spider {
protected ThreadGroup group;
protected SpiderThread[] thread;
protected Vector urls = new Vector();
protected int index = 0;
protected Vector listeners = new Vector();
protected String domains;
public void setDomain(String domain) {
domains = domain;
}
public Spider() {
group = new ThreadGroup("spider-group");
}
public int addURL(String url) {
//验证域
try {
URI u = new URI(url);
boolean bool = false;
String[] arrdomain = domains.split(",");
for (int i = 0, length = arrdomain.length; i < length; i++) {
if (arrdomain[i].trim().equals("*")) {
bool = true;
break;
}
if (u.getHost().endsWith(arrdomain[i].trim())) {
bool = true;
break;
}
}
if (!bool) {
return urls.size();
}
} catch (java.net.URISyntaxException e) {
e.printStackTrace();
return urls.size();
}
if (!urls.contains(url)) {
urls.add(url);
}
notifyListeners(new spider.define.Event(this, url), EVENT.ADDURL);
return urls.size();
}
public String getURL() {
if (index >= urls.size()) {
spider.define.Event event = new Event(this, null);
notifyListeners(event, EVENT.GETURL);
return null;
}
String retul = urls.get(index);
index++;
spider.define.Event event = new Event(this, retul);
notifyListeners(event, EVENT.GETURL);
return retul;
}
public void clearURL() {
index = 0;
urls.clear();
spider.define.Event event = new Event(this, null);
notifyListeners(event, EVENT.CLEARURL);
}
public void addMessage(spider.define.WebPage webPage, String url) {
//分析A标签获取链接地址
String content = webPage.getContent();
Pattern pattern = Pattern.compile("href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'>\\s]+))", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
Matcher matcher = pattern.matcher(content);
try {
URI uri = new URI(url);
while (matcher.find()) {
String link = matcher.group(1);
if (link != null) {
link = link.replace("&", "&").replace(" ", "").trim();
if (!link.startsWith("#") && !link.startsWith("javascript:") && !link.contains("'")) {
URI ulink = uri.resolve(link);
this.addURL(ulink.toString());
}
}
}
pattern = Pattern.compile("URL\\s*=\\s*(?:([^\"'>\\s]+))", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
matcher = pattern.matcher(content);
while (matcher.find()) {
String link = matcher.group(1);
if (link != null) {
link = link.replace("&", "&").trim();
if (!link.startsWith("#") && !link.startsWith("(")) {
System.out.println(link);
URI ulink = uri.resolve(link);
this.addURL(ulink.toString());
}
}
}
} catch (Exception e) {
e.printStackTrace();
//notifyListeners(new Event(e, url), EVENT.ERROR);
}
notifyListeners(new Event(webPage, url), EVENT.MESSAGE);
}
public void addError(Exception e, String url) {
spider.define.Event event = new Event(e, url);
notifyListeners(event, EVENT.ERROR);
}
public void start(int count) {
thread = new SpiderThread[count];
for (int i = 0, length = thread.length; i < length; i++) {
thread[i] = new SpiderThread(group, String.valueOf(i), this);
thread[i].isRun = true;
thread[i].start();
}
}
public void stop() {
for (int i = 0, length = thread.length; i < length; i++) {
thread[i].isRun = false;
}
}
public void addListener(spider.define.Listener listener) {
listeners.add(listener);
}
public void removeListener(spider.define.Listener listener) {
listeners.remove(listener);
}
public void notifyListeners(Event event, spider.define.Listener.EVENT eventType) {
for (Enumeration e = listeners.elements(); e.hasMoreElements();) {
spider.define.Listener listener = e.nextElement();
switch (eventType) {
case MESSAGE:
listener.message(event);
break;
case ERROR:
listener.error(event);
break;
case ADDURL:
listener.addURL(event);
break;
case GETURL:
listener.getURL(event);
break;
default:
listener.clearURL(event);
}
}
}
}
Leave a Reply