手工下载人教网(www.pep.com.cn)的课本实在太麻烦,于是自己动手写了个批量下载小程序(Java 版),可以一次下载一整本电子书,并按顺序存放。代码需要 jsoup-1.7.2.jar 库支持,在命令行下运行。在 cmd 中执行:java -jar downpep.jar "要下载的人教网电子书首页地址" "下载目录"。运行完成后,在指定目录中可以找到按电子书目录顺序保存的内容图片。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* * To change this template, choose Tools | Templates * and open the template in the editor. */ package downpep; import java.io.*; import java.net.*; import javax.xml.parsers.*; import org.w3c.dom.*; import org.xml.sax.SAXException; import org.jsoup.Jsoup; import org.jsoup.select.*; /** * * @author ShiZhuolin */ public class Downpep { /** * @param args the command line arguments */ public static void main(String[] args) throws Exception { URI uri = URI.create(args[0].trim()); Downpep pep = new Downpep(); pep.down(uri, args[1].trim()); } public void down(URI uri, String path) throws Exception { URI[] uris = getLinks(uri); System.out.println("down:"); for (int i = 0, length = uris.length; i < length; i++) { try { URI img = getIMG(uris[i]); String filename = gFilename(img, i); //if (img != null) { downimg(img, path + filename); //} System.out.println(i + "/" + length + " ok"); } catch (Exception e) { System.out.println(i + "/" + length + " " + e.getMessage()); } } System.out.println("ok"); } public String gFilename(URI uri, int i) { String filename = i + uri.toString().substring(uri.toString().lastIndexOf('.')); while (filename.length() < 8) { filename = "0" + filename; } return filename; } public void downimg(URI uri, String filename) throws Exception { URLConnection connect = uri.toURL().openConnection(); BufferedOutputStream bout; try (java.io.BufferedInputStream bin = new BufferedInputStream(connect.getInputStream())) { bout = new BufferedOutputStream(new FileOutputStream(new File(filename))); byte[] b = new byte[10]; while ((bin.read(b)) != -1) { bout.write(b); } bout.flush(); bin.close(); } bout.close(); } /** * 获取指定页面的图片地址 * * 
@param uri * @return */ public URI getIMG(URI uri) throws Exception { Elements elements; org.jsoup.nodes.Document doc; doc = Jsoup.connect(uri.toString()).get(); elements = doc.select("#doccontent IMG"); for (org.jsoup.nodes.Element elem : elements) { return uri.resolve(elem.attr("src")); } return null; } /** * 获取所有的html链接 * * @param uri * @return * @throws ParserConfigurationException * @throws SAXException * @throws IOException */ public URI[] getLinks(URI uri) throws ParserConfigurationException, SAXException, IOException { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); org.w3c.dom.Document document = builder.parse(uri.toString()); NodeList nodes = document.getElementsByTagName("L"); int length = nodes.getLength(); URI[] uris = new URI[length]; for (int i = 0; i < length; i++) { uris[i] = uri.resolve(nodes.item(i).getTextContent()); } return uris; } } |
Leave a Reply