JAVA写的蜘蛛程序(源码)

多线程。没办法，貌似多数程序员都写过这玩意。
java蜘蛛爬行

全部源码

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package spider.spider;

import java.net.URI;
import java.util.Vector;
import java.util.Enumeration;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import spider.define.Event;
import spider.define.Listener.EVENT;

/**
 * Multi-threaded crawl coordinator: keeps the queue of discovered URLs, hands
 * them out to worker threads one at a time, extracts further links from
 * fetched pages and reports every step to the registered listeners.
 *
 * <p>The URL queue ({@link #urls} together with the cursor {@link #index}) is
 * guarded by the monitor of {@code urls}, so {@link #addURL}, {@link #getURL}
 * and {@link #clearURL} may be called from several worker threads. Listeners
 * are always invoked outside of that lock.
 *
 * @author 石卓林
 */
public class Spider implements spider.define.Spider {

    /** Matches an {@code href} attribute whose value is double-quoted, single-quoted or unquoted. */
    private static final Pattern HREF_PATTERN = Pattern.compile(
            "href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'>\\s]+))",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Matches a {@code URL=...} target such as the one found in a meta-refresh tag. */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "URL\\s*=\\s*(?:([^\"'>\\s]+))",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Thread group handed to every worker thread created by {@link #start(int)}. */
    protected ThreadGroup group;
    /** Worker threads created by the last call to {@link #start(int)}; {@code null} before that. */
    protected SpiderThread[] thread;
    /** Every accepted URL, in discovery order. Entries are never removed except by {@link #clearURL()}. */
    protected Vector<String> urls = new Vector<String>();
    /** Position in {@link #urls} of the next URL that {@link #getURL()} will hand out. */
    protected int index = 0;
    /** Registered listeners. */
    protected Vector<spider.define.Listener> listeners = new Vector<spider.define.Listener>();
    /** Comma-separated list of permitted domains; the entry {@code *} permits everything. */
    protected String domains;

    /**
     * Sets the domains the spider is allowed to visit.
     *
     * @param domain comma-separated domain names; use {@code *} to allow every URL
     */
    public void setDomain(String domain) {
        domains = domain;
    }

    /** Creates a spider whose worker threads will belong to the thread group "spider-group". */
    public Spider() {
        group = new ThreadGroup("spider-group");
    }

    /**
     * Queues a URL if it is syntactically valid and its host is permitted by the
     * configured domains. A URL that is already queued is not queued twice, but
     * listeners still receive an {@code ADDURL} event for it.
     *
     * @param url absolute URL to queue
     * @return the number of URLs in the queue after the call
     */
    public int addURL(String url) {
        // Validate the domain before touching the queue.
        try {
            URI parsed = new URI(url);
            if (!isAllowedHost(parsed.getHost())) {
                return urls.size();
            }
        } catch (java.net.URISyntaxException e) {
            e.printStackTrace();
            return urls.size();
        }
        int size;
        // contains + add must be atomic, otherwise two workers can queue the same URL twice.
        synchronized (urls) {
            if (!urls.contains(url)) {
                urls.add(url);
            }
            size = urls.size();
        }
        notifyListeners(new spider.define.Event(this, url), EVENT.ADDURL);
        return size;
    }

    /**
     * Decides whether a host is covered by the configured domain list.
     *
     * @param host host component of the candidate URL; {@code null} for URLs without one
     *             (for example {@code mailto:} links)
     * @return {@code true} if the list contains {@code *}, or if the host equals one of the
     *         entries or is a sub-domain of it; {@code false} when no domains are configured
     */
    private boolean isAllowedHost(String host) {
        String configured = domains;
        if (configured == null) {
            return false;
        }
        String[] entries = configured.split(",");
        for (int i = 0; i < entries.length; i++) {
            String entry = entries[i].trim();
            if (entry.equals("*")) {
                return true;
            }
            if (host != null && hostMatches(host, entry)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Compares a host with one domain entry on a label boundary, ignoring case, so that
     * {@code example.com} covers {@code www.example.com} but not {@code evilexample.com}.
     * An empty entry matches nothing; a leading dot on the entry is ignored.
     */
    private static boolean hostMatches(String host, String domain) {
        String suffix = domain.startsWith(".") ? domain.substring(1) : domain;
        if (suffix.length() == 0) {
            return false;
        }
        if (host.equalsIgnoreCase(suffix)) {
            return true;
        }
        int offset = host.length() - suffix.length();
        return offset > 0
                && host.charAt(offset - 1) == '.'
                && host.regionMatches(true, offset, suffix, 0, suffix.length());
    }

    /**
     * Hands out the next URL that has not been handed out yet and fires a
     * {@code GETURL} event carrying it.
     *
     * @return the next URL, or {@code null} when the queue is exhausted
     */
    public String getURL() {
        String next = null;
        // Read and advance the cursor atomically so no URL is handed out twice or skipped.
        synchronized (urls) {
            if (index < urls.size()) {
                next = urls.get(index);
                index++;
            }
        }
        spider.define.Event event = new Event(this, next);
        notifyListeners(event, EVENT.GETURL);
        return next;
    }

    /** Empties the queue, resets the cursor and fires a {@code CLEARURL} event. */
    public void clearURL() {
        synchronized (urls) {
            index = 0;
            urls.clear();
        }
        spider.define.Event event = new Event(this, null);
        notifyListeners(event, EVENT.CLEARURL);
    }

    /**
     * Processes a fetched page: extracts the targets of {@code href} attributes and of
     * {@code URL=} redirects, resolves them against the page URL and queues them through
     * {@link #addURL(String)}, then fires a {@code MESSAGE} event for the page.
     *
     * <p>Link extraction is best effort: a link that cannot be resolved is skipped, and a
     * page whose own URL cannot be parsed yields no links. The {@code MESSAGE} event is
     * fired in every case.
     *
     * @param webPage the fetched page
     * @param url     the URL the page was fetched from; used as base for relative links
     */
    public void addMessage(spider.define.WebPage webPage, String url) {
        String content = webPage.getContent();
        if (content != null) {
            try {
                URI base = new URI(url);

                // Parse anchor tags to collect link addresses.
                Matcher hrefs = HREF_PATTERN.matcher(content);
                while (hrefs.find()) {
                    String link = firstCapturedGroup(hrefs);
                    if (link == null) {
                        continue;
                    }
                    link = link.replace("&amp;", "&").replace(" ", "").trim();
                    if (!link.startsWith("#") && !link.startsWith("javascript:") && !link.contains("'")) {
                        resolveAndAdd(base, link);
                    }
                }

                // Redirect targets of the form URL=... (e.g. meta refresh).
                Matcher redirects = URL_PATTERN.matcher(content);
                while (redirects.find()) {
                    String link = redirects.group(1);
                    if (link == null) {
                        continue;
                    }
                    link = link.replace("&amp;", "&").trim();
                    if (!link.startsWith("#") && !link.startsWith("(")) {
                        resolveAndAdd(base, link);
                    }
                }
            } catch (Exception e) {
                // Best effort: the page is still reported to listeners below.
                e.printStackTrace();
            }
        }
        notifyListeners(new Event(webPage, url), EVENT.MESSAGE);
    }

    /**
     * Returns the first capturing group that took part in the current match, so that
     * double-quoted, single-quoted and unquoted attribute values are all honoured.
     */
    private static String firstCapturedGroup(Matcher matcher) {
        for (int g = 1; g <= matcher.groupCount(); g++) {
            String value = matcher.group(g);
            if (value != null) {
                return value;
            }
        }
        return null;
    }

    /**
     * Resolves one link against the page URL and queues the result. A failure is logged
     * and confined to this link so the remaining links of the page are still processed.
     */
    private void resolveAndAdd(URI base, String link) {
        try {
            this.addURL(base.resolve(link).toString());
        } catch (RuntimeException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reports a failure for the given URL by firing an {@code ERROR} event.
     *
     * @param e   the failure
     * @param url the URL being processed when it occurred
     */
    public void addError(Exception e, String url) {
        spider.define.Event event = new Event(e, url);
        notifyListeners(event, EVENT.ERROR);
    }

    /**
     * Creates and starts the worker threads.
     *
     * @param count number of worker threads to start
     */
    public void start(int count) {
        thread = new SpiderThread[count];
        for (int i = 0, length = thread.length; i < length; i++) {
            thread[i] = new SpiderThread(group, String.valueOf(i), this);
            thread[i].isRun = true;
            thread[i].start();
        }
    }

    /**
     * Clears the run flag of every worker thread created by {@link #start(int)}.
     * Does nothing if the spider has not been started.
     */
    public void stop() {
        SpiderThread[] workers = thread;
        if (workers == null) {
            return;
        }
        for (int i = 0, length = workers.length; i < length; i++) {
            workers[i].isRun = false;
        }
    }

    /**
     * Registers a listener.
     *
     * @param listener listener to add
     */
    public void addListener(spider.define.Listener listener) {
        listeners.add(listener);
    }

    /**
     * Unregisters a listener.
     *
     * @param listener listener to remove
     */
    public void removeListener(spider.define.Listener listener) {
        listeners.remove(listener);
    }

    /**
     * Delivers an event to every registered listener by calling the callback that
     * corresponds to the event type. Any type other than {@code MESSAGE}, {@code ERROR},
     * {@code ADDURL} and {@code GETURL} is delivered to {@code clearURL}.
     *
     * @param event     the event to deliver
     * @param eventType selects the listener callback
     */
    public void notifyListeners(Event event, spider.define.Listener.EVENT eventType) {
        // Iterate over a snapshot so listeners may register or unregister during delivery.
        spider.define.Listener[] snapshot = listeners.toArray(new spider.define.Listener[0]);
        for (int i = 0; i < snapshot.length; i++) {
            spider.define.Listener listener = snapshot[i];
            switch (eventType) {
                case MESSAGE:
                    listener.message(event);
                    break;
                case ERROR:
                    listener.error(event);
                    break;
                case ADDURL:
                    listener.addURL(event);
                    break;
                case GETURL:
                    listener.getURL(event);
                    break;
                default:
                    listener.clearURL(event);
            }
        }
    }
}

Leave a Reply

Your email address will not be published. Required fields are marked *

Time limit is exhausted. Please reload the CAPTCHA.

Proudly powered by WordPress   Premium Style Theme by www.gopiplus.com