Java Web Crawler: Fetching Files with Multiple Threads

穆劲
2023-12-01

I remember writing this last year; taking it out again today to revisit it, a lot of the details have gone fuzzy.

There are six class files plus one JAR. Demo is the main file; DownloadFile downloads a file from a network URL (someone else had already written and packaged it, so I just reuse it); DownloadThread crawls the files on multiple threads, which is fast; HttpUtils fetches a URL's page as text so it can be parsed into an operable Document (also reused from elsewhere); MD5 needs no introduction; Task is the class that represents one file to process.

1 Demo.java

import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Demo {


	public static ArrayList<Task> arr = new ArrayList<Task>();

	public static void main(String[] args) {

		GeiALLimgUrl("http://www.csdn.net"); // 封装目标url
		
		int maxindex = 2; // number of worker threads; change it to whatever you like
		
		DownloadThread[] d = new DownloadThread[maxindex];
		
		for (int i = 0; i < maxindex; i++) {
			
			d[i] = new DownloadThread(i);
			
			d[i].start();
		
		}
	}

	public static void getAllImgUrl(String url) {
		try {
			String result = HttpUtils.doGet(url);
			
			Document doc = Jsoup.parse(result);
			
			Elements links = doc.select("img");
			
			for (Element imgs : links) {
				
				System.out.println(imgs.attr("src")); // img src scraped from the current page
				
				arr.add(new Task(imgs.attr("src"))); // store it in the list first; the workers process it later
			}
		} catch (Exception e) {

			e.printStackTrace();
		}
	}

	// Hand out the next task that has not been downloaded yet.
	// synchronized, so two worker threads cannot claim the same task.
	public static synchronized Task getTask() {
		for (Task s : arr) {
			if (!s.hasDownloaded) {
				s.hasDownloaded = true;
				return s;
			}
		}
		return null;
	}
}
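
Since the workers all poll the shared static arr, getTask has to be synchronized (the original unsynchronized version could hand the same task to two threads). A java.util.concurrent.BlockingQueue expresses the same hand-out-each-task-once idea with no explicit locking; a minimal sketch of that alternative, not part of the original code:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class TaskQueueSketch {

	private static final BlockingQueue<Task> queue = new LinkedBlockingQueue<Task>();

	// producer side: called while parsing the page
	public static void addTask(Task t) {
		queue.offer(t);
	}

	// consumer side: blocks until a task is available, which would
	// replace the sleep-and-retry loop in DownloadThread
	public static Task getTask() throws InterruptedException {
		return queue.take();
	}
}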

2 Task.java

public class Task {

	// the image URL
	public String imageUrl = "";
	
	// has this image been handed to a worker for download yet?
	public boolean hasDownloaded = false;
	
	// local file name
	public String filename;
	
	// constructor: just supply the image URL
	public Task(String url){
	
		imageUrl = url;
		
		// hash the URL so the file name is unique and filesystem-safe
		filename = MD5.string2MD5(url);
		
		// keep the original extension, if the URL has one
		int last = imageUrl.lastIndexOf(".");
		
		if (last >= 0) {
			String ext = imageUrl.substring(last + 1);
			filename = filename + "." + ext;
		}
		
		System.out.println("file name: " + filename);
	}
	
}

3 DownloadThread.java


import java.io.IOException;

public class DownloadThread extends Thread{

	// this worker's ID
	public int ID;
	
	// set to true to stop the worker; volatile so a change made by
	// another thread is visible to the run() loop
	public volatile boolean exit = false;
	
	public DownloadThread(int id){
	
		ID=id;
	}
	
	@Override
	public void run() {
		
		DownloadFile download=new DownloadFile();
		
		while(!exit){
		
			
			// take one not-yet-downloaded task from the task list
			Task target=Demo.getTask();
			
			if(target!=null){
			
				// download it
				System.out.println(ID);
				try {
					
					download.downLoadFromUrl(target.imageUrl, target.filename, "c:\\images");
				    
				} catch (IOException e) {
					
					e.printStackTrace();
				}
				
			}
			else{
			
				System.out.println("我是第"+ID+"个线程,我现在没有任务");
				
				// no task available, rest for a moment
				try {
					Thread.sleep(1000);
				} catch (InterruptedException e) {
					
					e.printStackTrace();
				}
			}
			
		}
		
	}
}
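
One loose end: nothing in Demo ever sets exit to true, so the workers keep polling forever once the task list is drained. A minimal shutdown sketch, assuming it is appended to the end of Demo.main (note hasDownloaded is set when a task is handed out, so this only waits until every task has been claimed):

		// hypothetical addition at the end of Demo.main
		// (main would then need to declare "throws InterruptedException")
		boolean pending = true;
		while (pending) {
			Thread.sleep(1000);
			pending = false;
			for (Task t : arr) {
				if (!t.hasDownloaded) { pending = true; break; }
			}
		}
		for (DownloadThread worker : d) {
			worker.exit = true; // visible to run() because exit is volatile
			worker.join();      // let the current download finish
		}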

4 DownloadFile.java


import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class DownloadFile {

	/**
	 * Download a file from a network URL
	 * @param urlStr   the file URL
	 * @param fileName local file name to save as
	 * @param savePath directory to save into
	 * @throws IOException
	 */
	public void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException{
		URL url = new URL(urlStr);  
		HttpURLConnection conn = (HttpURLConnection)url.openConnection();  
		// 3-second connect timeout
		conn.setConnectTimeout(3*1000);
		// send a browser User-Agent so the server does not block the crawler with a 403
		conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

		// where to save the file (mkdirs also creates missing parent directories)
		File saveDir = new File(savePath);
		if(!saveDir.exists()){
			saveDir.mkdirs();
		}
		
		File file = new File(saveDir+File.separator+fileName); 
		
		if(file.exists())
		{
			System.out.println("file already exists, no need to download it again");
			return;
		}
		
		// try-with-resources closes both streams even if an exception is thrown
		try (InputStream inputStream = conn.getInputStream();
		     FileOutputStream fos = new FileOutputStream(file)) {
			// read the whole body into a byte array, then write it out
			byte[] getData = readInputStream(inputStream);
			fos.write(getData);
		}

		System.out.println("info:"+url+" download success"); 

	}



	/**
	 * Read an input stream fully into a byte array
	 * @param inputStream
	 * @return
	 * @throws IOException
	 */
	public   byte[] readInputStream(InputStream inputStream) throws IOException {  
		byte[] buffer = new byte[1024];  
		int len = 0;  
		ByteArrayOutputStream bos = new ByteArrayOutputStream();  
		while((len = inputStream.read(buffer)) != -1) {  
			bos.write(buffer, 0, len);  
		}  
		bos.close();  
		return bos.toByteArray();  
	}  

	
}

5 HttpUtils.java


import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;


public class HttpUtils {

	// fetch the given URL and return the server's response body as text
	public static String doGet(String url) throws Exception 
	{ 
		    // create a URL object
		    URL localURL = new URL(url);
	      
		    // optional: route traffic through a local debugging proxy (e.g. Fiddler).
		    // Leave these lines commented out unless such a proxy is actually
		    // running, otherwise every request will fail.
		    //System.setProperty("http.proxyHost", "127.0.0.1");
	        //System.setProperty("http.proxyPort", "8888");
		    
	        URLConnection connection = localURL.openConnection();
	        HttpURLConnection httpURLConnection = (HttpURLConnection)connection;
   
	        // set request header fields
	        httpURLConnection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E");
	        
	        
	        // streams and buffers for reading the response
	        InputStream inputStream = null;
	        InputStreamReader inputStreamReader = null;
	        BufferedReader reader = null;
	        StringBuffer resultBuffer = new StringBuffer();
	        String tempLine = null;
	        
	        // 302 is a redirect, 200 is OK
	        if (httpURLConnection.getResponseCode() >= 300) {
	            throw new Exception("HTTP Request is not success, Response code is " + httpURLConnection.getResponseCode());
	        }
	        
	        try {
	            inputStream = httpURLConnection.getInputStream();
	      
	        
	        	// read the Content-Encoding response header
	        	String encoding = httpURLConnection.getHeaderField("Content-Encoding");
	            
	        	// if the returned HTML is gzip-compressed, unwrap it first
	        	if(encoding != null && encoding.equals("gzip"))
	        	{
	        	 System.out.println("this is a gzip-compressed HTML page\n");
	             GZIPInputStream gzin;  
	             gzin = new GZIPInputStream(inputStream); 
	             // decode the page as GBK so Chinese text is not garbled
	             // (adjust the charset to whatever the target site uses)
	             inputStreamReader = new InputStreamReader(gzin, "gbk");
	            
	        	}
	        	else
	        	{
	        	   inputStreamReader = new InputStreamReader(inputStream, "gbk");
	        	}
	            reader = new BufferedReader(inputStreamReader);
	            
	            while ((tempLine = reader.readLine()) != null) {
	                resultBuffer.append(tempLine+"\n");
	            }
	            
	        } finally {
	            
	            if (reader != null) {
	                reader.close();
	            }
	            
	            if (inputStreamReader != null) {
	                inputStreamReader.close();
	            }
	            
	            if (inputStream != null) {
	                inputStream.close();
	            }
	            
	        }
	        
	        return resultBuffer.toString();
	}
	
	/*
	 * currentUrl is the URL of the page currently being crawled;
	 * targetUrl is a URL extracted from a tag on that page (e.g. an href)
	 */
	public static String getURL(String currentUrl,String targetUrl)
	{
		String temp=targetUrl;
		// base path of the current page,
		// e.g. for http://www.gdmec.cn/cs/csnew/index.html
		// the base should be http://www.gdmec.cn/cs/csnew/
		String currentBase="";
		
		String resultURL="";
		
	
		if(currentUrl.endsWith("/"))
		{
			currentBase=currentUrl;
		}
		else
		{
			int lastPos=currentUrl.lastIndexOf("/");
			currentBase=currentUrl.substring(0,lastPos+1);
		}
		
		System.out.println("currentBase:"+currentBase);
		
		if(temp.startsWith("http"))
		{
			return resultURL;
		}
		else if(temp.startsWith("../"))
		{
			//resultURL=currentBase+temp.substring(2);
			
			
		}
		else if(temp.startsWith("./"))
		{
			resultURL=currentBase+temp.substring(2);
		}
		else if(temp.startsWith("//"))
		{
			resultURL="http:"+temp;
		}
		else if(temp.startsWith("/"))
		{
			resultURL=currentBase+temp.substring(1);
		}
		else
		{
			resultURL=currentBase+temp;
		}
		return resultURL;
		
	}
}
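
As a sanity check, the JDK already implements this resolution logic (including chains of "../" and root-relative paths), so getURL can be compared against java.net.URL:

		URL base = new URL("http://www.gdmec.cn/cs/csnew/index.html");
		URL resolved = new URL(base, "../images/logo.png");
		System.out.println(resolved); // http://www.gdmec.cn/cs/images/logo.png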

6 MD5.java


import java.security.MessageDigest;

public class MD5 {

	/***
	 * MD5-hash a string into a 32-character hex digest
	 */
	public static String string2MD5(String inStr){
		MessageDigest md5 = null;
		try{
			md5 = MessageDigest.getInstance("MD5");
		}catch (Exception e){
			System.out.println(e.toString());
			e.printStackTrace();
			return "";
		}
		// hash the UTF-8 bytes of the string; the original char-to-byte
		// cast would mangle any non-ASCII characters in the URL
		byte[] byteArray = inStr.getBytes(java.nio.charset.StandardCharsets.UTF_8);
		byte[] md5Bytes = md5.digest(byteArray);
		StringBuffer hexValue = new StringBuffer();
		for (int i = 0; i < md5Bytes.length; i++){
			int val = ((int) md5Bytes[i]) & 0xff;
			if (val < 16)
				hexValue.append("0");
			hexValue.append(Integer.toHexString(val));
		}
		return hexValue.toString();
	}
	
}

 JAR dependency: jsoup-1.9.2.jar

This example crawls images from a specified URL; crawling other things such as part-time job listings or weather data works the same way. Of course, crawl too heavily and you can be blocked at any moment. Also, some pages fetch their data with GET or POST requests, in which case you have to adapt the crawling approach accordingly, and some pages load their content asynchronously; I'll leave those for you to try yourselves.
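
For the GET/POST case, the change is mostly on the request side. Here is a minimal POST sketch using the same HttpURLConnection style as HttpUtils above; the endpoint and form fields are placeholders, not a real API:

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class PostDemo {

	public static void main(String[] args) throws Exception {
		// hypothetical endpoint and form data, just to show the shape of a POST
		URL url = new URL("http://example.com/search");
		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
		conn.setRequestMethod("POST");
		conn.setDoOutput(true); // must be set before writing a request body
		conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");

		// write the form body, then read the response the same way doGet does
		byte[] body = "keyword=java&page=1".getBytes("UTF-8");
		try (OutputStream os = conn.getOutputStream()) {
			os.write(body);
		}
		System.out.println("response code: " + conn.getResponseCode());
	}
}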

