
[JAVA] Building a Quartz Scheduler (5) - Crawler Job

by 허도치 2020. 11. 12.
Introduction

  In this post we finally attach a crawler to the scheduler. The previous posts in this series focused on configuring the scheduler itself; this time we implement a job that actually does work on each scheduled run. To keep the example simple, the crawler fetches the full list of posts on this blog, using the Jsoup library.
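
Before wiring it into the scheduler, here is the core Jsoup pattern the job is built on. This is a minimal, self-contained sketch for illustration; the URL and the div.post-item selector are the same ones used in CrawlerJob below:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupQuickstart {
    public static void main(String[] args) throws Exception {
        // Fetch the page and parse it into a DOM
        Document document = Jsoup.connect("https://heodolf.tistory.com/").get();

        // Select each post item's link and print its href and text
        document.select("div.post-item a[href]")
                .forEach((a) -> System.out.println(a.attr("href") + " : " + a.text()));
    }
}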

Development Environment

    - jdk-11.0.5
    - quartz-2.3.2
    - jsoup-1.11.3

<!-- Scheduler -->
<dependency>
  <groupId>org.quartz-scheduler</groupId>
  <artifactId>quartz</artifactId>
  <version>2.3.2</version>
</dependency>
<!-- Scheduler -->

<!-- Crawler -->
<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.11.3</version>
</dependency>
<!-- Crawler -->

<!-- Logging -->
<dependency>
  <groupId>org.slf4j</groupId>
  <artifactId>slf4j-api</artifactId>
  <version>1.7.25</version>
</dependency>
<dependency>
  <groupId>ch.qos.logback</groupId>
  <artifactId>logback-classic</artifactId>
  <version>1.2.3</version>
</dependency>
<!-- Logging -->

Source Code
1. CrawlerJob.java
package com.dochi.quartz.crawl;
 
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.InterruptableJob;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.quartz.UnableToInterruptJobException;
 
 
public class CrawlerJob implements InterruptableJob {
 
    private Thread currentThread = null;
    private final String URL = "https://heodolf.tistory.com/";
 
    @Override
    public void execute(JobExecutionContext context) throws JobExecutionException {
        // Keep a reference to the current thread so interrupt() can stop it
        this.currentThread = Thread.currentThread();
 
        System.out.println(String.format("[%s][%s][%s] START"
                , JobLauncher.TIMESTAMP_FMT.format(new Date())
                , this.getClass().getName()
                , context.getJobDetail().getKey().getName()));
 
        // Determine the page range to crawl
        int startPage = 1;
        int endPage = getEndPage();
 
        // Collect the post list page by page, starting from the last page
        List<Map<String, String>> postList = new ArrayList<Map<String, String>>();
        for(int page=endPage; page>=startPage; page--) {
            postList.addAll(getArticleList(URL+"?page="+page));
        }
 
        System.out.println(String.format("[%s][%s] - collected %d post(s)"
                , JobLauncher.TIMESTAMP_FMT.format(new Date())
                , this.getClass().getName()
                , postList.size()));
        System.out.println(String.format("[%s][%s][%s] END"
                , JobLauncher.TIMESTAMP_FMT.format(new Date())
                , this.getClass().getName()
                , context.getJobDetail().getKey().getName()));
    }
 
    @Override
    public void interrupt() throws UnableToInterruptJobException {
        // Interrupt handling
        //   - force-stops the running job thread
        if( this.currentThread != null ) {
            this.currentThread.interrupt();
        }
    }
 
    // Returns the number of the last page of the post list
    private int getEndPage() {
 
        int lastPage = 1;
 
        try {
            // Fetch and parse the page
            Document document = Jsoup.connect(URL).get();
 
            // Find the pagination link that points to the last page
            Element element = document.select(".pagination > a[href]:not([class])").last();
 
            // Parse the page number out of the link's href (?page=N)
            lastPage = Integer.valueOf(element.attr("href").split("=")[1]);
 
        } catch (IOException e) {
            e.printStackTrace();
        }
 
        return lastPage;
    }
 
    // Returns the list of posts on a single list page
    private List<Map<String, String>> getArticleList(String url) {
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
 
        try {
            // Fetch and parse the page
            Document document = Jsoup.connect(url).get();
 
            // Find every post item on the page
            Elements elements = document.select("div.post-item");
 
            // Parse each post item
            elements.forEach((element)->{
                Map<String, String> postMap = new HashMap<String, String>();
 
                // Find the anchor that wraps the post
                Element child = element.selectFirst("a[href]");
 
                // Extract the post id, title, and metadata
                String postId = child.attr("href");
                String title = child.selectFirst("span.title").text();
                String meta = child.selectFirst("span.meta").text();
 
                if( postId.startsWith("/") ) {
                    postId = postId.substring(1, postId.length());
                }
 
                // Store the extracted fields
                postMap.put("url", URL+postId);
                postMap.put("title", title);
                postMap.put("meta", meta);
 
                // Add the post to the result list
                list.add(postMap);
 
                System.out.println(postMap.toString());
            });
 
        } catch (IOException e) {
            e.printStackTrace();
        }
 
        return list;
    }
}
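
One refinement worth noting: the target URL is hard-coded in the job. Quartz can pass it in through the JobDataMap instead, so the same job class could crawl different sites per schedule. A short sketch of the idea (this variation is not part of the original post; the targetUrl key is illustrative):

// Hypothetical variation: supply the URL through the JobDataMap
JobDetail jobDetail = JobBuilder.newJob(CrawlerJob.class)
        .withIdentity("job_CrawlerJob")
        .usingJobData("targetUrl", "https://heodolf.tistory.com/")
        .build();

// Inside execute(), the job would read it back with:
// String url = context.getJobDetail().getJobDataMap().getString("targetUrl");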
 
2. JobLauncher.java
package com.dochi.quartz.crawl;
 
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Set;
 
import org.quartz.JobBuilder;
import org.quartz.JobDataMap;
import org.quartz.JobDetail;
import org.quartz.JobKey;
import org.quartz.Scheduler;
import org.quartz.SchedulerException;
import org.quartz.SchedulerFactory;
import org.quartz.SimpleScheduleBuilder;
import org.quartz.Trigger;
import org.quartz.TriggerBuilder;
import org.quartz.UnableToInterruptJobException;
import org.quartz.impl.StdSchedulerFactory;
import org.quartz.impl.matchers.GroupMatcher;
 
public class JobLauncher {
 
    // Constants
    //   - name prefixes (the NEXT_STEP_* prefixes are carried over from the previous post's step-job example and are not used here)
    public static final String PREFIX_STEP_JOB_NAME = "job_";
    public static final String PREFIX_STEP_TRIGGER_NAME = "trigger_";
    public static final String PREFIX_NEXT_STEP_JOB_NAME = "step_job_";
    public static final String PREFIX_NEXT_STEP_TRIGGER_NAME = "step_trigger_";
 
    //   - date formats
    public static final SimpleDateFormat TIMESTAMP_FMT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
    public static final SimpleDateFormat DATETIME_FMT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
 
    //   - JobDataMap keys (also from the previous post, unused here)
    public static final String MAIN_STEP_JOB_NAME = "mainStepJobName";
    public static final String NEXT_STEP_CLASS_NAME = "nextStepClassName";
    public static final String NEXT_STEP_JOB_NAME = "nextStepJobName";
 
    // Scheduler instances
    private static SchedulerFactory factory = null;
    private static Scheduler scheduler = null;
 
    // Main method
    public static void main(String[] args) throws SchedulerException {
        // Start the scheduler
        start();
 
        // Register the crawler schedule
        addSchedule("CrawlerJob");
 
        try {
            System.out.println("Press any key to exit...");
            System.in.read();
 
            // Stop the scheduler
            stop();
 
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
 
    // Starts the scheduler
    public static void start() throws SchedulerException {
        // Create the scheduler instance
        factory = new StdSchedulerFactory();
        scheduler = factory.getScheduler();
 
        // Start the scheduler
        scheduler.start();
    }
 
    // Stops the scheduler
    public static void stop() throws SchedulerException {
        try {
            System.out.println("Shutting down the scheduler...");
 
            // Collect the keys of all registered jobs
            Set<JobKey> allJobKeys = scheduler.getJobKeys(GroupMatcher.anyGroup());
 
            // Interrupt every running job
            allJobKeys.forEach((jobKey)->{
                try {
                    scheduler.interrupt(jobKey);
                } catch (UnableToInterruptJobException e) {
                    e.printStackTrace();
                }
            });
 
            // Shut down the scheduler
            //   - true : wait for running jobs to finish, then shut down
            //   - false: shut down immediately
            scheduler.shutdown(true);
 
            System.out.println("The scheduler has been shut down.");
        } catch (SchedulerException e) {
            e.printStackTrace();
        }
    }
 
    // Registers a schedule for the crawler job
    public static void addSchedule(String name) throws SchedulerException {
        // Build the JobDetail
        JobDetail jobDetail = JobBuilder.newJob(CrawlerJob.class)
                                .withIdentity(PREFIX_STEP_JOB_NAME+name)
                                .build();
 
        // Build the schedule
        //   - fires every 5 minutes, repeating 3 times after the first run (4 executions in total)
        SimpleScheduleBuilder schedule = SimpleScheduleBuilder.simpleSchedule()
                                            .withRepeatCount(3)
                                            .withIntervalInMinutes(5);
 
        // Build the trigger
        Trigger trigger = TriggerBuilder.newTrigger()
                              .withIdentity(PREFIX_STEP_TRIGGER_NAME+name)
                              .withSchedule(schedule)
                              .forJob(jobDetail)
                              .build();
 
        // Register the job and trigger with the scheduler
        scheduler.scheduleJob(jobDetail, trigger);
    }
}
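
For reference, if the crawl should run at fixed times of day rather than at a fixed interval, the SimpleScheduleBuilder can be swapped for Quartz's CronScheduleBuilder. A hedged sketch, not part of the original example (it would also need import org.quartz.CronScheduleBuilder):

// Hypothetical alternative: run the crawl every day at 06:00
Trigger cronTrigger = TriggerBuilder.newTrigger()
        .withIdentity(PREFIX_STEP_TRIGGER_NAME+name)
        .withSchedule(CronScheduleBuilder.cronSchedule("0 0 6 * * ?"))
        .forJob(jobDetail)
        .build();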
 
Results

  - Collects the title, creation date, and URL of every post exposed on this blog

  - Repeats every 5 minutes (with the schedule above, up to 4 runs in total)


Closing

  The crawler now runs automatically on a schedule. To keep the example simple it only collects and prints the list of posts, but for real-world use you would want to persist the results to a data store and avoid collecting the same data twice.
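
As a starting point for that deduplication, the job could remember which URLs it has already collected and skip them on later runs. A minimal in-memory sketch (the seenUrls field is hypothetical, and since it lives only in the JVM a real implementation would back it with a database):

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

public class CrawlerJob implements InterruptableJob {
    // Quartz creates a new job instance for every execution,
    // so the set must be static to survive between runs
    private static final Set<String> seenUrls = ConcurrentHashMap.newKeySet();

    // ... inside the forEach in getArticleList():
    //     if( !seenUrls.add(URL+postId) ) {
    //         return;   // already collected on a previous run, skip it
    //     }
}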
