`

Gerry版网页爬虫V0.01(Java语言版本)

阅读更多
import java.io.ByteArrayOutputStream;   
import java.io.File;   
import java.io.FileOutputStream;   
import java.io.IOException;   
import java.io.InputStream;   
import java.net.URL;   
import java.net.URLConnection;   
import java.util.ArrayList;   
import java.util.List;   
import java.util.regex.Matcher;   
import java.util.regex.Pattern;   
  
/**
 * Minimal single-page web crawler.
 *
 * <p>Downloads one page, rewrites every absolute {@code src="http://..."} reference in it to a
 * flat local file name, and saves the page plus the referenced resources into a local folder.
 * An instance also doubles as a small record describing one resource (original URL, local file
 * name) or, for the last entry produced by the parser, the rewritten HTML.
 *
 * <p>Author's original note: only links inside the crawled domain (second/third level) are meant
 * to be followed; image resources may be fetched from third-party sites.
 *
 * Created:2010-6-22
 * @author Gerry
 */
public class WebProcess {

    /** Matches an absolute {@code src} attribute; group 1 is the URL without its scheme. */
    private static final Pattern SRC_PATTERN =
            Pattern.compile("src=['\"]http://?([^'\"<>]+)['\"]?", Pattern.CASE_INSENSITIVE);

    /** Length of the literal {@code src=} plus its opening quote at the start of every match. */
    private static final int SRC_PREFIX_LENGTH = 5;

    private String urlName;     // original resource URL, without the "http://" scheme
    private String urlFileName; // flattened local file name derived from urlName
    private String html;        // HTML content

    public String getUrlFileName() {
        return urlFileName;
    }

    public void setUrlFileName(String urlFileName) {
        this.urlFileName = urlFileName;
    }

    public String getHtml() {
        return html;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public String getUrlName() {
        return urlName;
    }

    public void setUrlName(String urlName) {
        this.urlName = urlName;
    }

    /**
     * Downloads {@code http://<str_url>} and returns the raw response body.
     *
     * <p>NOTE: this sets the JVM-wide {@code http.proxyHost}/{@code http.proxyPort} system
     * properties on every call; the proxy host below is a placeholder and must be adapted to the
     * local network.
     *
     * @param str_url address without the {@code http://} scheme, e.g. {@code www.csdn.net}
     * @return the bytes of the response body
     * @throws Exception if the URL is malformed or the download fails
     */
    public ByteArrayOutputStream getAddressContext(String str_url) throws Exception {
        URL url = new URL("http://" + str_url);
        System.setProperty("http.proxyHost", "openproxy.xxxxxx.com"); // proxy host
        System.setProperty("http.proxyPort", "8080");                 // proxy port
        URLConnection conn = url.openConnection();
        conn.setRequestProperty("User-Agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/2008052906 Firefox/3.0");
        conn.setUseCaches(false);

        ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
        InputStream ins = conn.getInputStream();
        try {
            byte[] buffer = new byte[1024];
            int read;
            while ((read = ins.read(buffer)) != -1) {
                outputstream.write(buffer, 0, read);
            }
        } finally {
            // Release the connection even when reading fails half-way.
            ins.close();
        }
        return outputstream;
    }

    /**
     * Creates the directory {@code path}, including any missing parent directories.
     *
     * @return always 1
     */
    private int writeFolder(String path) {
        new File(path).mkdirs();
        return 1;
    }

    /**
     * Writes a file, replacing any existing content.
     *
     * <p>When {@code content} is {@code null} the bytes of {@code outputstream} are written
     * (binary file); otherwise {@code content} is written as UTF-8 text.
     *
     * @return always 1
     * @throws IOException if the file cannot be opened or written
     */
    private int writeFile(String path, String content, ByteArrayOutputStream outputstream) throws IOException {
        FileOutputStream fout = new FileOutputStream(new File(path));
        try {
            if (content == null) {
                fout.write(outputstream.toByteArray());
            } else {
                fout.write(content.getBytes("utf-8"));
            }
            fout.flush();
        } finally {
            // Do not leak the file handle when the write fails.
            fout.close();
        }
        return 1;
    }

    /**
     * Rewrites absolute resource references in a page to flat local file names (the directory
     * structure of the original site is not preserved).
     *
     * <p>For {@code type} "1" every {@code src="http://host/path"} attribute is replaced by
     * {@code src="host-path"}: the scheme is dropped and each {@code /} becomes {@code -}, e.g.
     * {@code http://hi.csdn.net/js/jquery-1.4.2.min.js} becomes
     * {@code hi.csdn.net-js-jquery-1.4.2.min.js}. One entry (URL + file name) is appended to
     * {@code list} per rewritten reference, in document order, followed by one final entry that
     * carries only the rewritten HTML. References whose URL has no {@code /} (a bare host) are
     * left untouched. Type "2" (href links, stylesheets) is not implemented and returns
     * {@code list} unchanged.
     *
     * @param type "1" for src resources, "2" for href links; {@code null} is treated as "1"
     * @param html page source to scan
     * @param list receives the resource entries and is returned
     * @return {@code list}
     * @throws Exception kept for signature compatibility
     */
    private List<WebProcess> parseHtmlContent(String type, String html, List<WebProcess> list) throws Exception {
        if (type == null) {
            type = "1";
        }
        if (!type.equals("1")) {
            return list;
        }

        StringBuilder rewritten = new StringBuilder(html);
        int from = 0;
        while (true) {
            // The buffer is edited in place, so take a fresh matcher for every scan.
            Matcher m = SRC_PATTERN.matcher(rewritten);
            if (!m.find(from)) {
                break;
            }
            int start = m.start();
            int end = m.end();
            String url = m.group(1);
            if (url.indexOf('/') > -1) {
                String fileName = url.replace('/', '-');
                WebProcess resource = new WebProcess();
                resource.setUrlName(url);          // original URL (without scheme)
                resource.setUrlFileName(fileName); // name of the local copy
                list.add(resource);
                // Keep `src=` + quote and the optional closing quote exactly as written; the text
                // is spliced in literally, so '$' or '\' in a URL cannot be misread as a group
                // reference, and the HTML always points at the same name the file is saved under.
                String replacement = rewritten.substring(start, start + SRC_PREFIX_LENGTH)
                        + fileName
                        + rewritten.substring(m.end(1), end);
                rewritten.replace(start, end, replacement);
            }
            // Always advance: a reference that is skipped (no '/') must not be matched again.
            from = start + 1;
        }

        WebProcess page = new WebProcess();
        page.setHtml(rewritten.toString());
        list.add(page);
        return list;
    }

    /**
     * Core workflow: downloads the page at {@code webname}, rewrites its resource links, and
     * saves the page as {@code index.html} together with its js/css/swf/gif/jpg/png resources.
     *
     * @param webname host (and optional path) without the {@code http://} scheme
     * @return always 1
     * @throws Exception if the main page cannot be downloaded or index.html cannot be written
     */
    private int kernelBusiness(String webname) throws Exception {
        System.out.println("正在读取远程文件:"+webname+"...");
        String content = getAddressContext(webname).toString("UTF-8");
        System.out.println("远程文件读取完毕!");
        List<WebProcess> list = parseHtmlContent("1", content, new ArrayList<WebProcess>());
        if (list != null && list.size() > 0) {
            // The last entry carries the rewritten page.
            content = list.get(list.size() - 1).getHtml();
        }
        System.out.println("一共是:"+list.size());

        // Save the rewritten page locally.
        System.out.println("正在写入文件index.html到本地...");
        String localAddress = "F:\\"+webname; // change this to the desired output location
        writeFolder(localAddress);
        writeFile(localAddress+"\\index.html", content, null);
        System.out.println("文件index.html保存完毕!");

        // Save the resources referenced by the page.
        for (WebProcess resource : list) {
            String resourceUrl = resource.getUrlName();
            String fileName = resource.getUrlFileName();
            if (resourceUrl == null || fileName == null) {
                continue; // the trailing entry only holds the HTML
            }
            if (!isSavableResource(fileName)) {
                continue; // unsupported type: nothing would be written, so do not download it
            }
            System.out.println("正在读取远程文件:"+resourceUrl+"...");
            ByteArrayOutputStream data;
            try {
                data = getAddressContext(resourceUrl);
            } catch (Exception ex) {
                System.out.println("远程连接出现异常!!!请求http://"+resourceUrl+"失败!");
                continue; // best effort: skip this resource instead of writing an empty file
            }
            System.out.println("远程文件读取完毕!");
            System.out.println("正在写入文件"+resourceUrl+"到本地...");
            try {
                // Always write the raw bytes: decoding and re-encoding text resources as UTF-8
                // would corrupt files served in any other charset.
                writeFile(localAddress+"\\"+fileName, null, data);
                System.out.println("文件"+resourceUrl+"保存完毕!");
            } catch (Exception ex) {
                System.out.println("文件写入出现异常!!!文件"+fileName+"失败!");
            }
        }
        return 1;
    }

    /** Returns whether the last three characters of {@code fileName} name a supported resource type. */
    private static boolean isSavableResource(String fileName) {
        if (fileName.length() <= 3) {
            return false;
        }
        String suffix = fileName.substring(fileName.length() - 3);
        return suffix.equalsIgnoreCase(".js")
                || suffix.equalsIgnoreCase("css")
                || suffix.equalsIgnoreCase("swf")
                || suffix.equalsIgnoreCase("gif")
                || suffix.equalsIgnoreCase("jpg")
                || suffix.equalsIgnoreCase("png");
    }

    /** Crawls {@code www.csdn.net} and prints the start time, end time and elapsed milliseconds. */
    public static void main(String[] args) throws Exception {
        long startTime = System.currentTimeMillis();
        System.out.println("开始执行时间:=="+startTime);

        String args1 = "www.csdn.net";
        WebProcess process = new WebProcess();
        process.kernelBusiness(args1);

        long endTime = System.currentTimeMillis();
        long wasterTime = endTime - startTime;
        System.out.println("结束执行时间:=="+endTime);
        System.out.println("一共执行了时间:=="+wasterTime); // 6172 ms
    }

}

 

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics