经过4天课余时间,Dspider做好了,虽然没有实现很多高级功能,但是对于抓取博客站和新闻站已经足够了,也满足了我的要求。

其主要特性有:

  • 支持自定义多线程抓取页面
  • 能够自定义内容存储方式,默认使用JSON格式保存文件
  • 数据处理模块和下载器分离

下面贴上源码:
TaskHandler

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
package com.nickzy.dspider;

import java.util.Iterator;
import java.util.concurrent.CountDownLatch;
import java.util.function.Function;


/**
 * Seeds the unread-URL queue from a spider's start URLs and drives one crawl:
 * it starts the {@link DataSolver} thread, launches the configured number of
 * {@link Downloader} threads, waits for all of them to finish and then raises
 * {@code FunctionKit.isDone}.
 *
 * <p>Note that constructing an instance runs the whole crawl: the constructor
 * calls {@link #start()}, which blocks until every downloader has counted
 * down the latch.
 *
 * @author zynick
 */
public class TaskHandler {
    /** Number of downloader threads; defaults to {@code FunctionKit.threads}. */
    int threads = FunctionKit.threads;
    /** Crawl configuration handed to every downloader. */
    Spider spider;
    /** Creation time in milliseconds, used to report the elapsed time. */
    long start = System.currentTimeMillis();

    /**
     * Queues every start URL of {@code spider} and immediately runs the crawl.
     *
     * @param spider crawl configuration providing the start URLs
     */
    public TaskHandler(Spider spider) {
        this.spider = spider;
        Iterator<?> startUrls = spider.getStart_url().iterator();
        while (startUrls.hasNext()) {
            UnreadQuee.addElem((String) startUrls.next());
        }
        System.out.println("------------------一个线程开始:"+Thread.currentThread().getName()+"线程");
        System.out.println("------------------检测到"+UnreadQuee.size()+"条url记录");
        start();
    }

    /**
     * Starts the data-solver thread and {@code threads} downloader threads
     * (one second apart), then blocks until all downloaders have finished,
     * flags completion and prints the elapsed time in milliseconds.
     *
     * <p>If the calling thread is interrupted while sleeping or waiting, the
     * interrupt is remembered and restored before this method returns, so the
     * caller can still observe it.
     */
    public void start() {
        CountDownLatch countDownLatch = new CountDownLatch(threads);
        DataSolver ds = new DataSolver(); // consumer thread for parsed data
        ds.start();
        System.out.println("解析器开启成功");

        boolean interrupted = false;
        for (int i = 1; i <= threads; i++) {
            Downloader downloader = new Downloader(spider, countDownLatch);
            Thread thread = new Thread(downloader, String.valueOf(i));
            System.out.println("------------------创建线程"+i+"线程");
            thread.start();
            try {
                Thread.sleep(1000); // stagger the start of the download threads
            } catch (InterruptedException e) {
                // Keep launching the remaining threads; the latch below still
                // expects exactly `threads` count-downs.
                interrupted = true;
                e.printStackTrace();
            }
        }
        System.out.println("------------------"+threads+"个线程开启完毕");

        try {
            countDownLatch.await();
        } catch (InterruptedException e) {
            interrupted = true;
            e.printStackTrace();
        }
        System.out.println("任务完成");
        FunctionKit.isDone = true; // tells the DataSolver loop to stop polling
        long end = System.currentTimeMillis();
        System.out.println("执行时长:" + (end - start));

        if (interrupted) {
            // Restore the interrupt status that catching the exception cleared.
            Thread.currentThread().interrupt();
        }
    }
}

FunctionKit

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
package com.nickzy.dspider;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Shared crawl settings and small static helpers for regex extraction,
 * list de-duplication and link resolution.
 */
public class FunctionKit {
    /** Not read anywhere in the visible sources; presumably a crawl-depth setting. */
    public static int depth = 0;
    /** Default number of downloader threads (read by TaskHandler). */
    public static int threads = 1;
    /** Not read anywhere in the visible sources. */
    public static boolean thread = false;
    /** Number of data regular expressions; DataSolver reads indexes 1..reg_datas. */
    public static int reg_datas = 1;
    /** Presumably the number of URL regular expressions; not read in the visible sources. */
    public static int reg_urls = 1;
    /**
     * End-of-run flag. It is set by one thread and polled by another, so it
     * is volatile to make the write visible to the polling thread.
     */
    public static volatile boolean isDone = false;

    /** Site root that is prepended to site-relative links. */
    private static final String SITE_ROOT = "http://www.oschina.net";

    /**
     * Collects the distinct matches of {@code regex} in {@code content}.
     *
     * <p>When the pattern declares at least one capturing group the value of
     * group 1 is collected; otherwise the whole match is used, so a pattern
     * without groups no longer fails with an IndexOutOfBoundsException.
     *
     * @param content text to search
     * @param regex   regular expression to apply
     * @return matches in order of first appearance, without duplicates
     */
    public static List<String> doRegex(String content, String regex) {
        Matcher matcher = Pattern.compile(regex).matcher(content);
        boolean hasGroup = matcher.groupCount() >= 1;
        List<String> found = new ArrayList<String>();
        while (matcher.find()) {
            String value = hasGroup ? matcher.group(1) : matcher.group();
            // An optional group that did not take part in the match yields null.
            if (value != null) {
                found.add(value);
            }
        }
        return removeDuplicate(found);
    }

    /**
     * Removes duplicate elements from {@code list} in place, keeping the
     * first occurrence of each element and the original order.
     *
     * @param list list to de-duplicate; it is modified
     * @return the same list instance
     */
    public static <T> List<T> removeDuplicate(List<T> list) {
        Set<T> seen = new HashSet<T>();
        // Set.add returns false for an element that was already seen.
        list.removeIf(element -> !seen.add(element));
        return list;
    }

    /**
     * Turns a link taken from a page into an absolute URL.
     *
     * @param href raw value of an href attribute
     * @return {@code href} itself when it is already an absolute http/https
     *         URL, an absolute URL for protocol-relative ("//host/...") and
     *         site-relative ("/...") links, or {@code null} for anything else
     *         (for example "#" or a null input)
     */
    public static String getHrefOfInOut(String href) {
        if (href == null) {
            return null;
        }
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        if (href.startsWith("//")) {
            // Protocol-relative link: reuse the scheme of the site root.
            return "http:" + href;
        }
        if (href.startsWith("/")) {
            return SITE_ROOT + href;
        }
        return null;
    }

    /**
     * Extracts the links of all {@code <a href="...">} anchors in
     * {@code content} that are in neither the unread nor the finished queue.
     *
     * @param content page source
     * @return the resolved links, each followed by "|"; empty when none qualify
     */
    public static String getHrefOfContent(String content) {
        System.out.println("解析地址开始");
        StringBuilder links = new StringBuilder();
        String[] pieces = content.split("<a href=\"");
        // pieces[0] is the text before the first anchor, so start at 1.
        for (int i = 1; i < pieces.length; i++) {
            int endHref = pieces[i].indexOf('"');
            if (endHref < 0) {
                continue; // unterminated attribute value: nothing to extract
            }
            String href = getHrefOfInOut(pieces[i].substring(0, endHref));
            if (href != null
                    && !UnreadQuee.isContains(href)
                    && !FinishQuee.isContains(href)) {
                links.append(href).append('|');
            }
        }
        return links.toString();
    }
}

Downloader

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
package com.nickzy.dspider;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;
import java.util.concurrent.CountDownLatch;

/**
 * Worker that takes URLs from the unread queue, downloads each page once and
 * hands the page source to the URL and data parsers. The shared latch is
 * counted down when the worker ends, whether normally or through an
 * unexpected error, so the thread waiting on the latch is never left hanging.
 */
public class Downloader implements Runnable {
    /** Connect and read timeout for a page download, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    private final Spider spider;
    private final CountDownLatch countDownLatch;

    /**
     * @param spider         crawl configuration passed on to the parsers
     * @param countDownLatch latch counted down once when this worker ends
     */
    public Downloader(Spider spider, CountDownLatch countDownLatch) {
        this.spider = spider;
        this.countDownLatch = countDownLatch;
    }

    /**
     * Processes URLs until the unread queue is empty. A failure on one page
     * is reported and does not stop the worker.
     */
    @Override
    public void run() {
        String threadid = "线程"+Thread.currentThread().getName()+":";
        try {
            System.out.println("------------------进入"+threadid);
            System.out.println(threadid+"检测到"+UnreadQuee.size()+"条url记录");
            while (!UnreadQuee.isEmpty()) {
                String url = UnreadQuee.outElem();
                // NOTE(review): outElem may return null if another worker took
                // the last URL after the isEmpty check - confirm against UnreadQuee.
                if (url == null || FinishQuee.isContains(url)) {
                    continue;
                }
                FinishQuee.addElem(url);
                try {
                    System.out.println(threadid+"正在下载:"+url);
                    String content = download(url);
                    System.out.println(threadid+"下载完毕");
                    // The returned link string is not used here; the call is
                    // kept because it logs the start of address parsing.
                    FunctionKit.getHrefOfContent(content);
                    DataParser.urlParser(content, spider);  // queue links found in the page
                    DataParser.dataParser(content, spider); // queue data found in the page
                    System.out.println(threadid+"待处理的链接数"+UnreadQuee.size());
                    System.out.println(threadid+"已处理的页面数"+FinishQuee.size());
                    System.out.println(threadid+"解析到的数据数"+PipelineQuee.size());
                } catch (Exception e) {
                    System.err.println(threadid+"下载失败");
                    e.printStackTrace();
                }
            }
            System.out.println(threadid+"成功退出");
        } finally {
            countDownLatch.countDown();
        }
    }

    /**
     * Downloads the page at {@code url} and returns its text with every line
     * terminated by "\n". The reader is closed even when reading fails.
     * Decoding uses the platform default charset.
     *
     * @param url absolute address of the page
     * @return the page source
     * @throws java.io.IOException if the URL is malformed or the transfer fails
     */
    private static String download(String url) throws java.io.IOException {
        HttpURLConnection huc = (HttpURLConnection) new URL(url).openConnection();
        huc.setConnectTimeout(TIMEOUT_MILLIS);
        // Without a read timeout a stalled server would block this worker forever.
        huc.setReadTimeout(TIMEOUT_MILLIS);
        // Only the response is read; no request body is written, so output
        // mode is not enabled on the connection.
        huc.setUseCaches(false);

        StringBuilder page = new StringBuilder();
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(huc.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append("\n");
            }
        }
        return page.toString();
    }
}

DataParser

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
package com.nickzy.dspider;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * Applies the spider's regular expressions to a downloaded page and feeds the
 * results into the pipeline queue (data) and the unread queue (links).
 */
public class DataParser {
    /**
     * Runs every data regex of {@code spider} against {@code content}.
     *
     * <p>The result map holds one list per regex, keyed 1..n in regex order,
     * including empty lists. The map is queued only when at least one regex
     * produced a match, so pages without any data no longer create empty
     * records and are reported as "no match".
     *
     * @param content page source
     * @param spider  crawl configuration providing the data regexes
     * @return {@code true} if at least one regex matched and the result was queued
     */
    public static boolean dataParser(String content, Spider spider) {
        System.out.println("匹配数据开始...");
        Map<Integer,List> map = new HashMap<Integer,List>();
        boolean matched = false;
        int index = 1; // result keys are 1-based
        for (String regex : new ArrayList<String>(spider.getRegex_data())) {
            List<String> hits = FunctionKit.doRegex(content, regex);
            map.put(index, hits);
            index++;
            if (!hits.isEmpty()) {
                matched = true;
            }
        }
        if (matched) {
            PipelineQuee.addElem(map);
            System.out.println("匹配数据完毕...");
        } else {
            System.out.println("没有获得匹配文本");
        }
        return matched;
    }

    /**
     * Runs every URL regex of {@code spider} against {@code content} and adds
     * each match to the unread queue.
     *
     * @param content page source
     * @param spider  crawl configuration providing the URL regexes
     * @return {@code true} if at least one link was found
     */
    public static boolean urlParser(String content, Spider spider) {
        System.out.println("匹配网页地址开始...");
        boolean found = false;
        for (String regex : new ArrayList<String>(spider.getRegex_url())) {
            List<String> links = FunctionKit.doRegex(content, regex);
            for (String link : links) {
                UnreadQuee.addElem(link);
            }
            if (!links.isEmpty()) {
                found = true;
            }
        }
        return found;
    }
}

DataSolver

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package com.nickzy.dspider;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;

import org.json.JSONArray;
import org.json.JSONObject;

/**
 * Consumer thread that takes parsed records from the pipeline queue, prints
 * them and collects them into a JSON object until {@code FunctionKit.isDone}
 * is set.
 */
public class DataSolver extends Thread {
    /** Pause between polls while the pipeline queue is empty, in milliseconds. */
    private static final long IDLE_SLEEP_MILLIS = 50;

    /** JSON text of {@link #jsonObject}, assigned when {@link #run()} ends. */
    String datasaved;
    /** Collected data: key "i" maps to an array holding one list per record for regex i. */
    JSONObject jsonObject = new JSONObject();
    /** Every match list in arrival order, across all regex indexes (kept for compatibility). */
    JSONArray jsonArray = new JSONArray();

    /**
     * Polls the pipeline queue until the end-of-run flag is set, then drains
     * whatever is still queued so that records produced just before the flag
     * was raised are not lost, and finally renders the JSON text.
     * Writing the text to a file is not invoked here.
     */
    @Override
    public void run() {
        while (!FunctionKit.isDone) {
            if (PipelineQuee.isEmpty()) {
                try {
                    Thread.sleep(IDLE_SLEEP_MILLIS); // avoid spinning at full CPU
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            } else {
                Map<Integer,List> record = PipelineQuee.outElem();
                collect(record);
            }
        }
        while (!PipelineQuee.isEmpty()) {
            Map<Integer,List> record = PipelineQuee.outElem();
            collect(record);
        }
        datasaved = jsonObject.toString();
    }

    /** Prints one record and appends each of its lists to the array of its own key. */
    private void collect(Map<Integer,List> record) {
        if (record == null) {
            return;
        }
        for (int i = 1; i <= FunctionKit.reg_datas; i++) {
            System.out.print("第"+i+"个匹配:");
            List<String> matches = copyOf(record.get(i));
            jsonArray.put(matches);
            // Each regex index gets its own array; sharing one array between
            // all keys would mix the results of different regexes.
            String key = String.valueOf(i);
            JSONArray perKey = jsonObject.optJSONArray(key);
            if (perKey == null) {
                perKey = new JSONArray();
                jsonObject.put(key, perKey);
            }
            perKey.put(matches);
            printMatches(matches);
            System.out.println("\n");
        }
        System.out.println("--------------------------");
    }

    /** Copies a raw match list; a missing list becomes an empty one. */
    @SuppressWarnings({"rawtypes", "unchecked"}) // the queue's maps hold raw lists
    private static List<String> copyOf(List raw) {
        return raw == null ? new ArrayList<String>() : new ArrayList<String>(raw);
    }

    /** Prints every match followed by "|" on the current line. */
    private static void printMatches(List<String> matches) {
        for (String s : matches) {
            System.out.print(s+"|");
        }
    }

    /**
     * Writes {@link #datasaved} to the data file, replacing its content.
     * The writer is closed even if writing fails.
     */
    private void writeToFile() throws IOException {
        File file = new File("E:\\data.json");
        // FileWriter creates the file when it does not exist yet.
        try (FileWriter writer = new FileWriter(file)) {
            writer.write(datasaved);
        }
    }

    /**
     * Debug helper: prints and removes everything currently in the pipeline
     * queue. It is static so it can be called without a running thread.
     *
     * @return always {@code true}
     */
    public static boolean dataSolver() {
        System.err.println(FunctionKit.reg_datas);
        while (!PipelineQuee.isEmpty()) {
            Map<Integer,List> record = PipelineQuee.outElem();
            for (int i = 1; i <= FunctionKit.reg_datas; i++) {
                System.out.print("第"+i+"个匹配:");
                printMatches(copyOf(record.get(i)));
                System.out.println("\n");
            }
            System.out.println("\n--------------------------");
        }
        return true;
    }
}