记得这个是去年的东西了,今天重新拿出来重温,一些知识都模糊了很多。
一共六个类文件加上一个jar包(jsoup):Demo是主文件;DownloadFile的作用是从网络URL上把文件下载下来,别人已经封装好了,直接拿来用;DownloadThread的作用是用多线程下载文件,速度快;HttpUtils的作用是把URL对应的网页内容取回来,再交给jsoup转换为可操作的Document,也是别人已经封装好的;MD5不用我说了吧;Task是描述单个下载任务(图片地址、文件名、是否已下载)的类。
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Entry point of the image crawler: collects the image URLs found on one page
 * into a shared task list and starts worker threads that download them.
 */
public class Demo {
    /**
     * Shared list of download tasks. Access from worker threads goes through
     * {@link #getTask()}, which is synchronized on this class.
     */
    public static ArrayList<Task> arr = new ArrayList<Task>();

    /**
     * Collects the tasks for the target page, then starts the download workers.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        GeiALLimgUrl("http://www.csdn.net"); // target page to crawl
        int maxindex = 2; // number of worker threads
        DownloadThread[] d = new DownloadThread[maxindex];
        for (int i = 0; i < maxindex; i++) {
            d[i] = new DownloadThread(i);
            d[i].start();
        }
    }

    /**
     * Fetches the page at {@code url}, finds every {@code <img>} element and
     * queues one {@link Task} per image.
     *
     * <p>Image addresses are resolved against the page URL, so relative and
     * protocol-relative {@code src} values become absolute URLs that can be
     * downloaded. Images whose {@code src} is missing or cannot be resolved
     * are skipped. Any failure is printed and leaves the task list as it was.
     *
     * @param url address of the page to scan
     */
    public static void GeiALLimgUrl(String url) {
        try {
            String result = HttpUtils.doGet(url);
            // Passing the page URL as base URI lets jsoup resolve relative links.
            Document doc = Jsoup.parse(result, url);
            Elements links = doc.select("img");
            for (Element img : links) {
                String src = img.absUrl("src");
                if (src.length() == 0) {
                    continue; // no usable address on this element
                }
                System.out.println(src);
                // Same lock as getTask(), in case workers are already running.
                synchronized (Demo.class) {
                    arr.add(new Task(src));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Hands out the next task that has not been claimed yet and marks it as
     * claimed. Synchronized so that two worker threads can never receive the
     * same task.
     *
     * @return an unclaimed task, or {@code null} when none is left
     */
    public static synchronized Task getTask() {
        for (Task s : arr) {
            if (!s.hasDownloaded) {
                s.hasDownloaded = true;
                return s;
            }
        }
        return null;
    }
}
/**
 * One image to download: its address, the local file name to store it under,
 * and a flag telling whether a worker has already claimed it.
 */
public class Task {
    // Address of the image.
    public String imageUrl="";
    // Set to true once a worker has taken this task.
    public boolean hasDownloaded=false;
    // Local file name: MD5 of the address plus the image's extension, if any.
    public String filename;

    /**
     * Creates a task for the given image address and derives its file name.
     *
     * @param url address of the image
     */
    public Task(String url){
        imageUrl=url;
        // The hash gives a file-system-safe name of fixed length.
        filename=MD5.string2MD5(url);
        String ext=extensionOf(url);
        if(ext.length()>0){
            filename=filename +"."+ext;
        }
        System.out.println("文件名:"+filename);
    }

    /**
     * Extracts the file extension from the last path segment of an address.
     * Query string, fragment and host name are ignored, so
     * {@code http://host/a.png?x=1} yields {@code png} and
     * {@code http://host/img} yields the empty string.
     *
     * @param url address to inspect
     * @return the extension without the dot, or "" when there is none or it
     *         contains characters other than letters and digits
     */
    private static String extensionOf(String url){
        String path=url;
        int cut=path.indexOf('?');
        if(cut>=0){
            path=path.substring(0,cut);
        }
        cut=path.indexOf('#');
        if(cut>=0){
            path=path.substring(0,cut);
        }
        // Drop "scheme://host" (or "//host") so a dot in the host name is
        // never mistaken for an extension separator.
        int hostStart=-1;
        int scheme=path.indexOf("://");
        if(scheme>=0){
            hostStart=scheme+3;
        }else if(path.startsWith("//")){
            hostStart=2;
        }
        if(hostStart>=0){
            int slash=path.indexOf('/',hostStart);
            if(slash<0){
                return "";
            }
            path=path.substring(slash);
        }
        String lastSegment=path.substring(path.lastIndexOf('/')+1);
        int dot=lastSegment.lastIndexOf('.');
        if(dot<0 || dot==lastSegment.length()-1){
            return "";
        }
        String ext=lastSegment.substring(dot+1);
        for(int i=0;i<ext.length();i++){
            if(!Character.isLetterOrDigit(ext.charAt(i))){
                return "";
            }
        }
        return ext;
    }
}
import java.io.IOException;
/**
 * Worker thread that repeatedly takes a task from {@link Demo#getTask()} and
 * downloads it, sleeping for a second whenever no task is available.
 * The loop ends when {@link #exit} is set to {@code true} or the thread is
 * interrupted.
 */
public class DownloadThread extends Thread{
    // Directory the images are written to.
    private static final String SAVE_DIR="c:\\images";

    // Identifier of this worker, used in console output.
    public int ID;
    // Stop flag; volatile so a write from another thread is seen by run().
    public volatile boolean exit=false;

    /**
     * @param id identifier printed in this worker's console output
     */
    public DownloadThread(int id){
        ID=id;
    }

    @Override
    public void run() {
        DownloadFile download=new DownloadFile();
        while(!exit){
            // Claim one task nobody else has taken yet.
            Task target=Demo.getTask();
            if(target!=null){
                System.out.println(ID);
                try {
                    download.downLoadFromUrl(target.imageUrl, target.filename, SAVE_DIR);
                } catch (IOException e) {
                    e.printStackTrace();
                } catch (RuntimeException e) {
                    // One unusable address must not terminate the worker.
                    e.printStackTrace();
                }
            }
            else{
                System.out.println("我是第"+ID+"个线程,我现在没有任务");
                // Nothing to do right now; wait before polling again.
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    // Treat an interrupt as a stop request and keep the flag set.
                    Thread.currentThread().interrupt();
                    exit=true;
                }
            }
        }
    }
}
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Downloads a resource over HTTP and stores it as a file.
 */
public class DownloadFile {
    private static final int CONNECT_TIMEOUT_MS = 3 * 1000;
    // Upper bound for a single read, so a stalled server cannot block a worker forever.
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /**
     * Downloads {@code urlStr} into {@code savePath/fileName}.
     *
     * <p>If the target file already exists nothing is downloaded. The save
     * directory is created (including missing parents) when necessary. The
     * response is read completely before the file is created, so a failed
     * transfer does not leave a partial file behind.
     *
     * @param urlStr   address of the resource
     * @param fileName name of the file to create inside {@code savePath}
     * @param savePath directory the file is written to
     * @throws IOException if the address is malformed, the directory cannot be
     *                     created, or the transfer or the write fails
     */
    public void downLoadFromUrl(String urlStr,String fileName,String savePath) throws IOException{
        URL url = new URL(urlStr);

        File saveDir = new File(savePath);
        if(!saveDir.exists() && !saveDir.mkdirs() && !saveDir.isDirectory()){
            throw new IOException("Cannot create directory " + saveDir);
        }
        File file = new File(saveDir, fileName);
        // Check before connecting: no point in transferring data we would discard.
        if(file.exists()){
            System.out.println("文件已存在,不用重复下载");
            return;
        }

        HttpURLConnection conn = (HttpURLConnection)url.openConnection();
        conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        conn.setReadTimeout(READ_TIMEOUT_MS);
        // Some servers answer 403 to clients without a browser-like User-Agent.
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        try {
            byte[] getData;
            InputStream inputStream = conn.getInputStream();
            try {
                getData = readInputStream(inputStream);
            } finally {
                inputStream.close();
            }
            FileOutputStream fos = new FileOutputStream(file);
            try {
                fos.write(getData);
            } finally {
                fos.close();
            }
        } finally {
            conn.disconnect();
        }
        System.out.println("info:"+url+" download success");
    }

    /**
     * Reads a stream to its end and returns everything that was read.
     * The stream is not closed by this method.
     *
     * @param inputStream stream to drain
     * @return all bytes read; an empty array if the stream was already at its end
     * @throws IOException if reading fails
     */
    public byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        int len;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        while((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        return bos.toByteArray();
    }
}
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;
/**
 * Small HTTP helpers: fetching a page as text and resolving links found in it.
 */
public class HttpUtils {
    // Used when the response does not declare a usable charset.
    private static final String DEFAULT_CHARSET = "gbk";
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /**
     * Sends a GET request and returns the response body as text, one
     * {@code '\n'} appended after every line.
     *
     * <p>The body is decoded with the charset named in the Content-Type
     * header, falling back to GBK; gzip-encoded bodies are decompressed.
     * No proxy is forced here: configure one with the standard
     * {@code http.proxyHost}/{@code http.proxyPort} system properties if needed.
     *
     * @param url address to fetch
     * @return the response body
     * @throws Exception if the address is invalid, the response code is 300
     *                   or higher, or the transfer fails
     */
    public static String doGet(String url) throws Exception
    {
        URL localURL = new URL(url);
        URLConnection connection = localURL.openConnection();
        HttpURLConnection httpURLConnection = (HttpURLConnection)connection;
        httpURLConnection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        httpURLConnection.setReadTimeout(READ_TIMEOUT_MS);
        httpURLConnection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E");

        InputStream inputStream = null;
        BufferedReader reader = null;
        StringBuilder resultBuffer = new StringBuilder();
        try {
            int responseCode = httpURLConnection.getResponseCode();
            if (responseCode >= 300) {
                throw new Exception("HTTP Request is not success, Response code is " + responseCode);
            }
            inputStream = httpURLConnection.getInputStream();
            InputStream body = inputStream;
            String encoding = httpURLConnection.getHeaderField("Content-Encoding");
            if(encoding!=null && encoding.equalsIgnoreCase("gzip"))
            {
                System.out.println("这是一个压缩的HTML\n");
                body = new GZIPInputStream(inputStream);
            }
            String charset = charsetOf(httpURLConnection.getContentType());
            reader = new BufferedReader(new InputStreamReader(body, charset));
            String tempLine;
            while ((tempLine = reader.readLine()) != null) {
                resultBuffer.append(tempLine).append('\n');
            }
        } finally {
            try {
                // Closing the reader closes the wrapped streams as well.
                if (reader != null) {
                    reader.close();
                } else if (inputStream != null) {
                    inputStream.close();
                }
            } finally {
                httpURLConnection.disconnect();
            }
        }
        return resultBuffer.toString();
    }

    /**
     * Picks the charset name out of a Content-Type header value such as
     * {@code text/html; charset=utf-8}.
     *
     * @param contentType header value, may be {@code null}
     * @return the declared charset if this JVM supports it, otherwise GBK
     */
    private static String charsetOf(String contentType)
    {
        if (contentType != null) {
            String[] parts = contentType.split(";");
            for (int i = 0; i < parts.length; i++) {
                String part = parts[i].trim();
                if (part.regionMatches(true, 0, "charset=", 0, 8)) {
                    String name = part.substring(8).replace("\"", "").trim();
                    try {
                        if (name.length() > 0 && java.nio.charset.Charset.isSupported(name)) {
                            return name;
                        }
                    } catch (IllegalArgumentException ignored) {
                        // Syntactically illegal charset name: use the default.
                    }
                }
            }
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Resolves a link found on a page against the address of that page.
     *
     * <p>Handles absolute addresses (returned unchanged), protocol-relative
     * ({@code //host/x}), root-relative ({@code /x}), {@code ./x},
     * {@code ../x} and plain relative links. For example, with the page
     * {@code http://www.gdmec.cn/cs/csnew/index.html} the link {@code a.jpg}
     * becomes {@code http://www.gdmec.cn/cs/csnew/a.jpg}.
     *
     * @param currentUrl address of the page the link was found on
     * @param targetUrl  link taken from the page (e.g. an href or src value)
     * @return the absolute address, or "" if either argument is {@code null}
     *         or the link cannot be resolved to a URL
     */
    public static String getURL(String currentUrl,String targetUrl)
    {
        if (currentUrl == null || targetUrl == null) {
            return "";
        }
        try {
            return new URL(new URL(currentUrl), targetUrl).toString();
        } catch (MalformedURLException e) {
            return "";
        }
    }
}
import java.security.MessageDigest;
/**
 * MD5 helper used to turn arbitrary strings into fixed-length names.
 */
public class MD5 {
    /**
     * Computes the MD5 digest of a string's UTF-8 bytes.
     *
     * <p>Hashing the UTF-8 encoding keeps distinct strings distinct: casting
     * each char to a byte would drop the high byte and make different
     * non-ASCII inputs hash to the same value.
     *
     * @param inStr text to hash; must not be {@code null}
     * @return the digest as 32 lower-case hex digits, or "" if the MD5
     *         algorithm is not available
     */
    public static String string2MD5(String inStr){
        MessageDigest md5;
        try{
            md5 = MessageDigest.getInstance("MD5");
        }catch (Exception e){
            System.out.println(e.toString());
            e.printStackTrace();
            return "";
        }
        byte[] md5Bytes = md5.digest(inStr.getBytes(java.nio.charset.Charset.forName("UTF-8")));
        StringBuilder hexValue = new StringBuilder(md5Bytes.length * 2);
        for (int i = 0; i < md5Bytes.length; i++){
            int val = md5Bytes[i] & 0xff;
            // toHexString drops the leading zero of values below 0x10.
            if (val < 16) {
                hexValue.append('0');
            }
            hexValue.append(Integer.toHexString(val));
        }
        return hexValue.toString();
    }
}
这里演示的是爬取指定URL页面上的图片,其他的比如兼职信息、天气信息等也可以用同样的思路爬取。当然,爬取过于频繁随时可能被对方网站封掉;另外,有些网页需要用特定的 GET 或 POST 请求才能获取信息,这时就要适当修改爬取的方式;还有一些网页是异步加载的,就留给你们自己尝试了。