用Java批量下载人教网电子课本和教师用书

手工下载人教网（www.pep.com.cn）的课本实在太麻烦，于是自己动手写了个批量下载小程序（Java 版），可以一次下载一整本电子书，并按顺序存放。代码需要 jsoup-1.7.2.jar 库支持，在命令行中运行：java -jar downpep.jar "要下载的人教网电子书首页地址" "下载目录"。运行完成后，在指定目录中可以找到按电子书目录顺序保存的内容图片。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package downpep;
 
import java.io.*;
import java.net.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
import org.jsoup.Jsoup;
import org.jsoup.select.*;
 
/**
 *
 * @author ShiZhuolin
 */
public class Downpep {
 
    /**
     * @param args the command line arguments
     */
    public static void main(String[] args)
            throws Exception {
        URI uri = URI.create(args[0].trim());
        Downpep pep = new Downpep();
        pep.down(uri, args[1].trim());
    }
 
    public void down(URI uri, String path) throws Exception {
        URI[] uris = getLinks(uri);
        System.out.println("down:");
        for (int i = 0, length = uris.length; i < length; i++) {
            try {
                URI img = getIMG(uris[i]);
                String filename = gFilename(img, i);
                //if (img != null) {
                downimg(img, path + filename);
                //}
                System.out.println(i + "/" + length + " ok");
            } catch (Exception e) {
                System.out.println(i + "/" + length + " " + e.getMessage());
            }
        }
        System.out.println("ok");
    }
 
    public String gFilename(URI uri, int i) {
        String filename = i + uri.toString().substring(uri.toString().lastIndexOf('.'));
        while (filename.length() < 8) {
            filename = "0" + filename;
        }
        return filename;
    }
 
    public void downimg(URI uri, String filename) throws Exception {
        URLConnection connect = uri.toURL().openConnection();
        BufferedOutputStream bout;
        try (java.io.BufferedInputStream bin = new BufferedInputStream(connect.getInputStream())) {
            bout = new BufferedOutputStream(new FileOutputStream(new File(filename)));
            byte[] b = new byte[10];
            while ((bin.read(b)) != -1) {
                bout.write(b);
            }
            bout.flush();
            bin.close();
        }
        bout.close();
    }
 
    /**
     * 获取指定页面的图片地址
     *
     * @param uri
     * @return
     */
    public URI getIMG(URI uri) throws Exception {
        Elements elements;
        org.jsoup.nodes.Document doc;
        doc = Jsoup.connect(uri.toString()).get();
        elements = doc.select("#doccontent IMG");
        for (org.jsoup.nodes.Element elem : elements) {
            return uri.resolve(elem.attr("src"));
        }
        return null;
    }
 
    /**
     * 获取所有的html链接
     *
     * @param uri
     * @return
     * @throws ParserConfigurationException
     * @throws SAXException
     * @throws IOException
     */
    public URI[] getLinks(URI uri)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        org.w3c.dom.Document document = builder.parse(uri.toString());
        NodeList nodes = document.getElementsByTagName("L");
        int length = nodes.getLength();
        URI[] uris = new URI[length];
        for (int i = 0; i < length; i++) {
            uris[i] = uri.resolve(nodes.item(i).getTextContent());
        }
        return uris;
    }
}

项目文件(netbeans)

Leave a Reply

Your email address will not be published. Required fields are marked *

Time limit is exhausted. Please reload the CAPTCHA.

Proudly powered by WordPress   Premium Style Theme by www.gopiplus.com