
[JAVA] Building a Quartz Scheduler (5) - Crawler Job

by 허도치 2020. 11. 12.
Introduction

  In this post we finally attach a crawler to the scheduler. The previous posts in this series focused on configuring the scheduler itself; this time we implement a job that actually does work on each scheduled run. To keep the example simple, the crawler fetches the full list of posts on this blog, using the Jsoup library.
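
Before wiring it into the scheduler, here is the core Jsoup pattern the job is built on. This is a minimal, self-contained sketch for illustration; the URL and the div.post-item selector are the same ones used in CrawlerJob below:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupQuickstart {
    public static void main(String[] args) throws Exception {
        // Fetch the page and parse it into a DOM
        Document document = Jsoup.connect("https://heodolf.tistory.com/").get();

        // Select each post item's link and print its href and text
        document.select("div.post-item a[href]")
                .forEach((a) -> System.out.println(a.attr("href") + " : " + a.text()));
    }
}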

Development Environment

    - jdk-11.0.5
    - quartz-2.3.2
    - jsoup-1.11.3

<!-- Scheduler -->
<dependency>
  <groupId>org.quartz-scheduler</groupId>
  <artifactId>quartz</artifactId>
  <version>2.3.2</version>
</dependency>
<!-- Scheduler -->

<!-- Crawler -->
<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.11.3</version>
</dependency>
<!-- Crawler -->

<!-- Logging -->
<dependency>
  <groupId>org.slf4j</groupId>
  <artifactId>slf4j-api</artifactId>
  <version>1.7.25</version>
</dependency>
<dependency>
  <groupId>ch.qos.logback</groupId>
  <artifactId>logback-classic</artifactId>
  <version>1.2.3</version>
</dependency>
<!-- Logging -->

Source Code
1. CrawlerJob.java
package com.dochi.quartz.crawl;
 
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.InterruptableJob;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.quartz.UnableToInterruptJobException;
 
 
public class CrawlerJob implements InterruptableJob {
 
    private Thread currentThread = null;
    private final String URL = "https://heodolf.tistory.com/";
 
    @Override
    public void execute(JobExecutionContext context) throws JobExecutionException {
        // Keep a reference to the current thread so interrupt() can stop it
        this.currentThread = Thread.currentThread();
 
        System.out.println(String.format("[%s][%s][%s] START"
                , JobLauncher.TIMESTAMP_FMT.format(new Date())
                , this.getClass().getName()
                , context.getJobDetail().getKey().getName()));
 
        // Determine the page range to crawl
        int startPage = 1;
        int endPage = getEndPage();
 
        // Collect the post list page by page, starting from the last page
        List<Map<String, String>> postList = new ArrayList<Map<String, String>>();
        for(int page=endPage; page>=startPage; page--) {
            postList.addAll(getArticleList(URL+"?page="+page));
        }
 
        System.out.println(String.format("[%s][%s] - collected %d post(s)"
                , JobLauncher.TIMESTAMP_FMT.format(new Date())
                , this.getClass().getName()
                , postList.size()));
        System.out.println(String.format("[%s][%s][%s] END"
                , JobLauncher.TIMESTAMP_FMT.format(new Date())
                , this.getClass().getName()
                , context.getJobDetail().getKey().getName()));
    }
 
    @Override
    public void interrupt() throws UnableToInterruptJobException {
        // Interrupt handling
        //   - force-stops the running job thread
        if( this.currentThread != null ) {
            this.currentThread.interrupt();
        }
    }
 
    // Returns the number of the last page of the post list
    private int getEndPage() {
 
        int lastPage = 1;
 
        try {
            // Fetch and parse the page
            Document document = Jsoup.connect(URL).get();
 
            // Find the pagination link that points to the last page
            Element element = document.select(".pagination > a[href]:not([class])").last();
 
            // Parse the page number out of the link's href (?page=N)
            lastPage = Integer.valueOf(element.attr("href").split("=")[1]);
 
        } catch (IOException e) {
            e.printStackTrace();
        }
 
        return lastPage;
    }
 
    // Returns the list of posts on a single list page
    private List<Map<String, String>> getArticleList(String url) {
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
 
        try {
            // Fetch and parse the page
            Document document = Jsoup.connect(url).get();
 
            // Find every post item on the page
            Elements elements = document.select("div.post-item");
 
            // Parse each post item
            elements.forEach((element)->{
                Map<String, String> postMap = new HashMap<String, String>();
 
                // Find the anchor that wraps the post
                Element child = element.selectFirst("a[href]");
 
                // Extract the post id, title, and metadata
                String postId = child.attr("href");
                String title = child.selectFirst("span.title").text();
                String meta = child.selectFirst("span.meta").text();
 
                if( postId.startsWith("/") ) {
                    postId = postId.substring(1, postId.length());
                }
 
                // Store the extracted fields
                postMap.put("url", URL+postId);
                postMap.put("title", title);
                postMap.put("meta", meta);
 
                // Add the post to the result list
                list.add(postMap);
 
                System.out.println(postMap.toString());
            });
 
        } catch (IOException e) {
            e.printStackTrace();
        }
 
        return list;
    }
}
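
One refinement worth noting: the target URL is hard-coded in the job. Quartz can pass it in through the JobDataMap instead, so the same job class could crawl different sites per schedule. A short sketch of the idea (this variation is not part of the original post; the targetUrl key is illustrative):

// Hypothetical variation: supply the URL through the JobDataMap
JobDetail jobDetail = JobBuilder.newJob(CrawlerJob.class)
        .withIdentity("job_CrawlerJob")
        .usingJobData("targetUrl", "https://heodolf.tistory.com/")
        .build();

// Inside execute(), the job would read it back with:
// String url = context.getJobDetail().getJobDataMap().getString("targetUrl");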
 
2. JobLauncher.java
package com.dochi.quartz.crawl;
 
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Set;
 
import org.quartz.JobBuilder;
import org.quartz.JobDataMap;
import org.quartz.JobDetail;
import org.quartz.JobKey;
import org.quartz.Scheduler;
import org.quartz.SchedulerException;
import org.quartz.SchedulerFactory;
import org.quartz.SimpleScheduleBuilder;
import org.quartz.Trigger;
import org.quartz.TriggerBuilder;
import org.quartz.UnableToInterruptJobException;
import org.quartz.impl.StdSchedulerFactory;
import org.quartz.impl.matchers.GroupMatcher;
 
public class JobLauncher {
 
    // Constants
    //   - name prefixes (the NEXT_STEP_* prefixes are carried over from the previous post's step-job example and are not used here)
    public static final String PREFIX_STEP_JOB_NAME = "job_";
    public static final String PREFIX_STEP_TRIGGER_NAME = "trigger_";
    public static final String PREFIX_NEXT_STEP_JOB_NAME = "step_job_";
    public static final String PREFIX_NEXT_STEP_TRIGGER_NAME = "step_trigger_";
 
    //   - date formats
    public static final SimpleDateFormat TIMESTAMP_FMT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
    public static final SimpleDateFormat DATETIME_FMT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
 
    //   - JobDataMap keys (also from the previous post, unused here)
    public static final String MAIN_STEP_JOB_NAME = "mainStepJobName";
    public static final String NEXT_STEP_CLASS_NAME = "nextStepClassName";
    public static final String NEXT_STEP_JOB_NAME = "nextStepJobName";
 
    // Scheduler instances
    private static SchedulerFactory factory = null;
    private static Scheduler scheduler = null;
 
    // Main method
    public static void main(String[] args) throws SchedulerException {
        // Start the scheduler
        start();
 
        // Register the crawler schedule
        addSchedule("CrawlerJob");
 
        try {
            System.out.println("Press any key to exit...");
            System.in.read();
 
            // Stop the scheduler
            stop();
 
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
 
    // Starts the scheduler
    public static void start() throws SchedulerException {
        // Create the scheduler instance
        factory = new StdSchedulerFactory();
        scheduler = factory.getScheduler();
 
        // Start the scheduler
        scheduler.start();
    }
 
    // Stops the scheduler
    public static void stop() throws SchedulerException {
        try {
            System.out.println("Shutting down the scheduler...");
 
            // Collect the keys of all registered jobs
            Set<JobKey> allJobKeys = scheduler.getJobKeys(GroupMatcher.anyGroup());
 
            // Interrupt every running job
            allJobKeys.forEach((jobKey)->{
                try {
                    scheduler.interrupt(jobKey);
                } catch (UnableToInterruptJobException e) {
                    e.printStackTrace();
                }
            });
 
            // Shut down the scheduler
            //   - true : wait for running jobs to finish, then shut down
            //   - false: shut down immediately
            scheduler.shutdown(true);
 
            System.out.println("The scheduler has been shut down.");
        } catch (SchedulerException e) {
            e.printStackTrace();
        }
    }
 
    // Registers a schedule for the crawler job
    public static void addSchedule(String name) throws SchedulerException {
        // Build the JobDetail
        JobDetail jobDetail = JobBuilder.newJob(CrawlerJob.class)
                                .withIdentity(PREFIX_STEP_JOB_NAME+name)
                                .build();
 
        // Build the schedule
        //   - fires every 5 minutes, repeating 3 times after the first run (4 executions in total)
        SimpleScheduleBuilder schedule = SimpleScheduleBuilder.simpleSchedule()
                                            .withRepeatCount(3)
                                            .withIntervalInMinutes(5);
 
        // Build the trigger
        Trigger trigger = TriggerBuilder.newTrigger()
                              .withIdentity(PREFIX_STEP_TRIGGER_NAME+name)
                              .withSchedule(schedule)
                              .forJob(jobDetail)
                              .build();
 
        // Register the job and trigger with the scheduler
        scheduler.scheduleJob(jobDetail, trigger);
    }
}
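
For reference, if the crawl should run at fixed times of day rather than at a fixed interval, the SimpleScheduleBuilder can be swapped for Quartz's CronScheduleBuilder. A hedged sketch, not part of the original example (it would also need import org.quartz.CronScheduleBuilder):

// Hypothetical alternative: run the crawl every day at 06:00
Trigger cronTrigger = TriggerBuilder.newTrigger()
        .withIdentity(PREFIX_STEP_TRIGGER_NAME+name)
        .withSchedule(CronScheduleBuilder.cronSchedule("0 0 6 * * ?"))
        .forJob(jobDetail)
        .build();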
 
Results

  - Collects the title, creation date, and URL of every post exposed on this blog

  - Repeats every 5 minutes (with the schedule above, up to 4 runs in total)


Closing

  The crawler now runs automatically on a schedule. To keep the example simple it only collects and prints the list of posts, but for real-world use you would want to persist the results to a data store and avoid collecting the same data twice.
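
As a starting point for that deduplication, the job could remember which URLs it has already collected and skip them on later runs. A minimal in-memory sketch (the seenUrls field is hypothetical, and since it lives only in the JVM a real implementation would back it with a database):

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

public class CrawlerJob implements InterruptableJob {
    // Quartz creates a new job instance for every execution,
    // so the set must be static to survive between runs
    private static final Set<String> seenUrls = ConcurrentHashMap.newKeySet();

    // ... inside the forEach in getArticleList():
    //     if( !seenUrls.add(URL+postId) ) {
    //         return;   // already collected on a previous run, skip it
    //     }
}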
