多线程。没办法，貌似多数程序员都写过这玩意。
全部源码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | /* * To change this template, choose Tools | Templates * and open the template in the editor. */ package spider.spider; import java.net.URI; import java.util.Vector; import java.util.Enumeration; import java.util.regex.Pattern; import java.util.regex.Matcher; import spider.define.Event; import spider.define.Listener.EVENT; /** * * @author 石卓林 */ public class Spider implements spider.define.Spider { protected ThreadGroup group; protected SpiderThread[] thread; protected Vector<String> urls = new Vector<String>(); protected int index = 0; protected Vector<spider.define.Listener> listeners = new Vector<spider.define.Listener>(); protected String domains; public void setDomain(String domain) { domains = domain; } public Spider() { group = new ThreadGroup("spider-group"); } public int addURL(String url) { //验证域 try { URI u = new URI(url); boolean bool = false; String[] arrdomain = domains.split(","); for (int i = 0, length = arrdomain.length; i < length; i++) { if (arrdomain[i].trim().equals("*")) { bool = true; break; } if (u.getHost().endsWith(arrdomain[i].trim())) { bool = true; break; } } if (!bool) { return urls.size(); } } catch (java.net.URISyntaxException e) { e.printStackTrace(); return urls.size(); } if (!urls.contains(url)) { urls.add(url); } notifyListeners(new spider.define.Event(this, url), EVENT.ADDURL); return urls.size(); } public String getURL() { if 
(index >= urls.size()) { spider.define.Event event = new Event(this, null); notifyListeners(event, EVENT.GETURL); return null; } String retul = urls.get(index); index++; spider.define.Event event = new Event(this, retul); notifyListeners(event, EVENT.GETURL); return retul; } public void clearURL() { index = 0; urls.clear(); spider.define.Event event = new Event(this, null); notifyListeners(event, EVENT.CLEARURL); } public void addMessage(spider.define.WebPage webPage, String url) { //分析A标签获取链接地址 String content = webPage.getContent(); Pattern pattern = Pattern.compile("href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'>\\s]+))", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); Matcher matcher = pattern.matcher(content); try { URI uri = new URI(url); while (matcher.find()) { String link = matcher.group(1); if (link != null) { link = link.replace("&", "&").replace(" ", "").trim(); if (!link.startsWith("#") && !link.startsWith("javascript:") && !link.contains("'")) { URI ulink = uri.resolve(link); this.addURL(ulink.toString()); } } } pattern = Pattern.compile("URL\\s*=\\s*(?:([^\"'>\\s]+))", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); matcher = pattern.matcher(content); while (matcher.find()) { String link = matcher.group(1); if (link != null) { link = link.replace("&", "&").trim(); if (!link.startsWith("#") && !link.startsWith("(")) { System.out.println(link); URI ulink = uri.resolve(link); this.addURL(ulink.toString()); } } } } catch (Exception e) { e.printStackTrace(); //notifyListeners(new Event(e, url), EVENT.ERROR); } notifyListeners(new Event(webPage, url), EVENT.MESSAGE); } public void addError(Exception e, String url) { spider.define.Event event = new Event(e, url); notifyListeners(event, EVENT.ERROR); } public void start(int count) { thread = new SpiderThread[count]; for (int i = 0, length = thread.length; i < length; i++) { thread[i] = new SpiderThread(group, String.valueOf(i), this); thread[i].isRun = true; thread[i].start(); } } public void stop() { for (int i 
= 0, length = thread.length; i < length; i++) { thread[i].isRun = false; } } public void addListener(spider.define.Listener listener) { listeners.add(listener); } public void removeListener(spider.define.Listener listener) { listeners.remove(listener); } public void notifyListeners(Event event, spider.define.Listener.EVENT eventType) { for (Enumeration<spider.define.Listener> e = listeners.elements(); e.hasMoreElements();) { spider.define.Listener listener = e.nextElement(); switch (eventType) { case MESSAGE: listener.message(event); break; case ERROR: listener.error(event); break; case ADDURL: listener.addURL(event); break; case GETURL: listener.getURL(event); break; default: listener.clearURL(event); } } } } |
Leave a Reply