手工下载人教网（www.pep.com.cn）的课本实在太麻烦，于是自己动手写了个批量下载小程序（Java 版），可以一次下载一整本电子书，并按顺序存放。代码需要 jsoup-1.7.2.jar 库支持，在命令行下运行：在 cmd 中执行 java -jar downpep.jar "要下载的人教网电子书首页地址" "下载目录"。运行完成后，在指定目录中可以找到按电子书目录顺序保存的内容图片。
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package downpep;
import java.io.*;
import java.net.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
import org.jsoup.Jsoup;
import org.jsoup.select.*;
/**
 * Command-line batch downloader for an e-textbook hosted on www.pep.com.cn.
 *
 * <p>Given the address of a book's index document, it reads the list of page
 * links from that document, fetches every page, locates the page image and
 * saves the images into a target directory under zero-padded, sequential file
 * names so that they sort in reading order.
 *
 * <p>Requires the jsoup library on the class path.
 *
 * @author ShiZhuolin
 */
public class Downpep {

    /** Number of digits the page index is zero-padded to in saved file names. */
    private static final int INDEX_WIDTH = 4;

    /** Size in bytes of the buffer used when copying an image to disk. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the address of the book's index document,
     *             {@code args[1]} is the directory the images are saved into
     * @throws Exception if the index document cannot be fetched or parsed, or
     *                   the download directory cannot be created
     */
    public static void main(String[] args)
            throws Exception {
        if (args.length < 2) {
            System.err.println("usage: java -jar downpep.jar <book-index-url> <download-directory>");
            System.exit(1);
            return;
        }
        URI uri = URI.create(args[0].trim());
        Downpep pep = new Downpep();
        pep.down(uri, args[1].trim());
    }

    /**
     * Downloads every page image of the book whose index document is at
     * {@code uri} into the directory {@code path}.
     *
     * <p>A failure on a single page is reported on standard output and does
     * not stop the remaining pages from being downloaded.
     *
     * @param uri  address of the book's index document
     * @param path directory to save the images into; created if missing, and
     *             an empty string means the current directory
     * @throws Exception if the index document cannot be read or the target
     *                   directory cannot be created
     */
    public void down(URI uri, String path) throws Exception {
        URI[] uris = getLinks(uri);

        // Join directory and file name through File so that a directory given
        // without a trailing separator still receives the files inside it.
        File dir = new File(path.isEmpty() ? "." : path);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("cannot create download directory: " + dir);
        }

        System.out.println("down:");
        for (int i = 0, length = uris.length; i < length; i++) {
            try {
                URI img = getIMG(uris[i]);
                if (img == null) {
                    throw new IOException("no image found in " + uris[i]);
                }
                String filename = gFilename(img, i);
                downimg(img, new File(dir, filename).getPath());
                System.out.println(i + "/" + length + " ok");
            } catch (Exception e) {
                // Best effort per page: report the problem and carry on.
                String reason = e.getMessage() != null ? e.getMessage() : e.toString();
                System.out.println(i + "/" + length + " " + reason);
            }
        }
        System.out.println("ok");
    }

    /**
     * Builds the local file name for the image at position {@code i}.
     *
     * <p>The name is the index zero-padded to four digits followed by the
     * extension (including the dot) of the last path segment of {@code uri},
     * for example {@code 0005.jpg}. If that segment has no extension the name
     * is just the padded index.
     *
     * @param uri address of the image
     * @param i   zero-based position of the page in the book
     * @return the file name, without any directory part
     */
    public String gFilename(URI uri, int i) {
        // Look only at the path so dots in the host or query are ignored.
        String location = uri.getPath();
        if (location == null) {
            location = uri.toString();
        }
        String lastSegment = location.substring(location.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        String extension = dot >= 0 ? lastSegment.substring(dot) : "";

        // Pad the index itself, so the width does not depend on the extension.
        String number = Integer.toString(i);
        StringBuilder filename = new StringBuilder();
        for (int pad = INDEX_WIDTH - number.length(); pad > 0; pad--) {
            filename.append('0');
        }
        return filename.append(number).append(extension).toString();
    }

    /**
     * Downloads the resource at {@code uri} and writes it to {@code filename}.
     *
     * @param uri      address of the resource to download
     * @param filename path of the file to write; an existing file is overwritten
     * @throws Exception if the resource cannot be read or the file cannot be written
     */
    public void downimg(URI uri, String filename) throws Exception {
        URLConnection connect = uri.toURL().openConnection();
        // Both streams are managed by try-with-resources so neither leaks
        // when the copy fails part-way through.
        try (InputStream in = new BufferedInputStream(connect.getInputStream());
             OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(filename)))) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                // Write only the bytes actually read; the rest of the buffer
                // holds stale data from an earlier iteration.
                out.write(buffer, 0, read);
            }
        }
    }

    /**
     * Returns the address of the image shown on the given page.
     *
     * <p>The page is fetched and the first {@code IMG} element inside the
     * element with id {@code doccontent} is used; its {@code src} attribute
     * is resolved against {@code uri}.
     *
     * @param uri address of the page
     * @return the absolute image address, or {@code null} if the page has no
     *         matching image
     * @throws Exception if the page cannot be fetched
     */
    public URI getIMG(URI uri) throws Exception {
        org.jsoup.nodes.Document doc = Jsoup.connect(uri.toString()).get();
        Elements images = doc.select("#doccontent IMG");
        if (images.isEmpty()) {
            return null;
        }
        return uri.resolve(images.first().attr("src"));
    }

    /**
     * Returns the addresses of all pages listed in the book's index document.
     *
     * <p>The index is parsed as XML; the text of every {@code L} element is
     * taken as a link and resolved against {@code uri}.
     *
     * @param uri address of the index document
     * @return the absolute page addresses in document order
     * @throws ParserConfigurationException if the XML parser cannot be configured
     * @throws SAXException if the index document is not well-formed XML
     * @throws IOException if the index document cannot be read
     */
    public URI[] getLinks(URI uri)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // The document comes from the network: do not let it pull in
        // external entities (XXE).
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        DocumentBuilder builder = factory.newDocumentBuilder();
        org.w3c.dom.Document document = builder.parse(uri.toString());
        NodeList nodes = document.getElementsByTagName("L");
        int length = nodes.getLength();
        URI[] uris = new URI[length];
        for (int i = 0; i < length; i++) {
            // Surrounding whitespace in the element text is not a legal URI character.
            uris[i] = uri.resolve(nodes.item(i).getTextContent().trim());
        }
        return uris;
    }
}
Leave a Reply