多线程。没办法，貌似多数程序员都写过这玩意。
全部源码
| /* * To change this template, choose Tools | Templates * and open the template in the editor. */ package spider.spider; import java.net.URI; import java.util.Vector; import java.util.Enumeration; import java.util.regex.Pattern; import java.util.regex.Matcher; import spider.define.Event; import spider.define.Listener.EVENT; /** * * @author 石卓林 */ public class Spider implements spider.define.Spider { protected ThreadGroup group; protected SpiderThread[] thread; protected Vector<String> urls = new Vector<String>(); protected int index = 0; protected Vector<spider.define.Listener> listeners = new Vector<spider.define.Listener>(); protected String domains; public void setDomain(String domain) { domains = domain; } public Spider() { group = new ThreadGroup("spider-group"); } public int addURL(String url) { //验证域 try { URI u = new URI(url); boolean bool = false; String[] arrdomain = domains.split(","); for (int i = 0, length = arrdomain.length; i < length; i++) { if (arrdomain[i].trim().equals("*")) { bool = true; break; } if (u.getHost().endsWith(arrdomain[i].trim())) { bool = true; break; } } if (!bool) { return urls.size(); } } catch (java.net.URISyntaxException e) { e.printStackTrace(); return urls.size(); } if (!urls.contains(url)) { urls.add(url); } notifyListeners(new spider.define.Event(this, url), EVENT.ADDURL); return urls.size(); } public String getURL() { if (index >= urls.size()) { spider.define.Event event = new Event(this, null); notifyListeners(event, EVENT.GETURL); return null; } String retul = urls.get(index); index++; spider.define.Event event = new Event(this, retul); notifyListeners(event, EVENT.GETURL); return retul; } public void clearURL() { index = 0; urls.clear(); spider.define.Event event = new Event(this, null); notifyListeners(event, EVENT.CLEARURL); } public void addMessage(spider.define.WebPage webPage, String url) { //分析A标签获取链接地址 String content = webPage.getContent(); Pattern pattern = 
Pattern.compile("href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'>\\s]+))", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); Matcher matcher = pattern.matcher(content); try { URI uri = new URI(url); while (matcher.find()) { String link = matcher.group(1); if (link != null) { link = link.replace("&", "&").replace(" ", "").trim(); if (!link.startsWith("#") && !link.startsWith("javascript:") && !link.contains("'")) { URI ulink = uri.resolve(link); this.addURL(ulink.toString()); } } } pattern = Pattern.compile("URL\\s*=\\s*(?:([^\"'>\\s]+))", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); matcher = pattern.matcher(content); while (matcher.find()) { String link = matcher.group(1); if (link != null) { link = link.replace("&", "&").trim(); if (!link.startsWith("#") && !link.startsWith("(")) { System.out.println(link); URI ulink = uri.resolve(link); this.addURL(ulink.toString()); } } } } catch (Exception e) { e.printStackTrace(); //notifyListeners(new Event(e, url), EVENT.ERROR); } notifyListeners(new Event(webPage, url), EVENT.MESSAGE); } public void addError(Exception e, String url) { spider.define.Event event = new Event(e, url); notifyListeners(event, EVENT.ERROR); } public void start(int count) { thread = new SpiderThread[count]; for (int i = 0, length = thread.length; i < length; i++) { thread[i] = new SpiderThread(group, String.valueOf(i), this); thread[i].isRun = true; thread[i].start(); } } public void stop() { for (int i = 0, length = thread.length; i < length; i++) { thread[i].isRun = false; } } public void addListener(spider.define.Listener listener) { listeners.add(listener); } public void removeListener(spider.define.Listener listener) { listeners.remove(listener); } public void notifyListeners(Event event, spider.define.Listener.EVENT eventType) { for (Enumeration<spider.define.Listener> e = listeners.elements(); e.hasMoreElements();) { spider.define.Listener listener = e.nextElement(); switch (eventType) { case MESSAGE: listener.message(event); break; case 
ERROR: listener.error(event); break; case ADDURL: listener.addURL(event); break; case GETURL: listener.getURL(event); break; default: listener.clearURL(event); } } } } |
Leave a Reply