Post

[Java] Jsoup 속도 개선 - ForkJoinPool / ForkJoin Framework

[Java] Jsoup 속도 개선 - ForkJoinPool / ForkJoin Framework

📌 문제의 코드

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
    @Transactional
    public void saveTools() {
        List<SubCategory> subCategories = subCategoryRepository.findAll();
        
        for (SubCategory subCategory : subCategories) {
            String categoryUrl = subCategory.getSubCategoryUrl();
            int page = 1;

            while (true) {
                try {
                    String pageUrl = categoryUrl + "page/" + page + "/";
                    Document doc = Jsoup.connect(pageUrl).get();

                    Element latestPosts = doc.selectFirst("div[class='latest-posts']");
                    if (latestPosts == null) break;

                    Elements posts = latestPosts.select("div[class^=post-item]");
                    if (posts.isEmpty()) break;

                    for (Element post : posts) {
                        processPost(post);
                    }
                    page++;
                } catch (IOException e) {
                    log.error("ํŽ˜์ด์ง€ ํฌ๋กค๋ง ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {} (์นดํ…Œ๊ณ ๋ฆฌ: {})", e.getMessage(), subCategory.getSubCategoryName());
                    break;
                }
            }
        }
    }

    /**
     * Fetches the detail page linked from one listing entry and, when no tool with the same
     * name is stored yet, saves the tool and links it to its categories.
     *
     * <p>Entries without a title link, with an unresolvable link, or whose detail page lacks
     * the name or description element are skipped instead of failing the crawl.
     *
     * <p>NOTE(review): this method is invoked from within the same class; if
     * {@code @Transactional} is proxy-based here, the annotation may have no effect on such
     * calls — confirm.
     *
     * @param post a listing element expected to contain {@code span.post-title a.dark-title}
     */
    @Transactional
    protected void processPost(Element post) {
        Element titleElement = post.selectFirst("span.post-title a.dark-title");
        if (titleElement == null) {
            return;
        }

        // absUrl resolves a relative href against the page's base URI and yields "" when the
        // attribute is missing or cannot be turned into a URL, which Jsoup.connect would reject.
        String detailUrl = titleElement.absUrl("href");
        if (detailUrl.isEmpty()) {
            return;
        }

        try {
            Document detailDoc = Jsoup.connect(detailUrl).get();

            // selectFirst returns null when nothing matches; guard before calling text().
            Element nameElement = detailDoc.selectFirst("span[class*=post-title]");
            Element descriptionElement = detailDoc.selectFirst("span.desc-text");
            if (nameElement == null || descriptionElement == null) {
                log.warn("도구 상세 페이지에서 이름 또는 설명을 찾지 못해 건너뜀: {}", detailUrl);
                return;
            }

            String toolName = nameElement.text();
            String description = descriptionElement.text();

            if (!toolsRepository.existsByToolName(toolName)) {
                Tool savedTool = saveTool(toolName, description, detailUrl);
                processCategories(detailDoc, savedTool);
            }
        } catch (IOException e) {
            log.error("도구 상세 페이지 크롤링 중 오류 발생: {}", e.getMessage(), e);
        }
    }

    /**
     * Builds a {@code Tool} from the crawled values and persists it through
     * {@code toolsRepository}.
     *
     * @param toolName    name of the tool as read from the detail page
     * @param description description text as read from the detail page
     * @param detailUrl   URL of the detail page, stored as the tool's link
     * @return the instance returned by {@code toolsRepository.save}
     */
    private Tool saveTool(String toolName, String description, String detailUrl) {
        Tool tool = Tool.builder()
                .toolName(toolName)
                .toolDescription(description)
                .toolLink(detailUrl)
                // Start with an empty, mutable list so category links can be appended later.
                .toolCategories(new ArrayList<>())
                .build();

        Tool savedTool = toolsRepository.save(tool);
        log.info("저장된 AI 도구: {}", toolName);
        return savedTool;
    }

    /**
     * Links a saved tool to every known sub-category named on its detail page.
     *
     * <p>Category names are read from the {@code data-title} attribute of
     * {@code div.entry-categories a span[data-title]}. Names that do not match a stored
     * {@code SubCategory} are ignored. Blank names and names already handled for this page
     * are skipped so that a repeated label cannot create a duplicate link.
     *
     * @param detailDoc parsed detail page of the tool
     * @param savedTool the persisted tool to attach categories to
     */
    private void processCategories(Document detailDoc, Tool savedTool) {
        Elements categoryElements = detailDoc.select("div.entry-categories a span[data-title]");
        // Fully qualified so the block does not depend on additional imports.
        java.util.Set<String> handledNames = new java.util.HashSet<>();

        for (Element categoryElement : categoryElements) {
            String categoryName = categoryElement.attr("data-title").trim();
            if (categoryName.isEmpty() || !handledNames.add(categoryName)) {
                continue;
            }

            SubCategory subCat = subCategoryRepository.findBySubCategoryName(categoryName)
                    .orElse(null);
            if (subCat == null) {
                continue;
            }

            ToolCategory toolCategory = ToolCategory.builder()
                    .tool(savedTool)
                    .subCategory(subCat)
                    .build();

            // Keep both sides of the association in sync before persisting the link.
            savedTool.getToolCategories().add(toolCategory);
            if (subCat.getToolCategories() == null) {
                subCat.setToolCategories(new ArrayList<>());
            }
            subCat.getToolCategories().add(toolCategory);

            toolCategoryRepository.save(toolCategory);

            log.info("도구-카테고리 연결: {} - {}", savedTool.getToolName(), categoryName);
        }
    }

특정 웹사이트를 크롤링하는 코드이다. 간략하게 함수의 동작에 대해 알아보자.

  • saveTools()

DB์— ์กด์žฌํ•˜๋Š” SubCategory๋ฅผ ์กฐํšŒํ•˜์—ฌ ๊ฐ SubCategory์˜ URL์„ ๊ธฐ๋ฐ˜์œผ๋กœ ํŽ˜์ด์ง€๋„ค์ด์…˜ํ•˜์—ฌ ํฌ๋กค๋ง์„ ์ง„ํ–‰ํ•œ๋‹ค. ๊ฐ ํŽ˜์ด์ง€์—์„œ post๋ฅผ ์ˆ˜์ง‘ํ•œ๋‹ค.

  • processPost()

์ˆ˜์ง‘ํ•œ post์— ๋Œ€ํ•˜์—ฌ ์ •๋ณด๋“ค์„ ์ˆ˜์ง‘ํ•œ๋‹ค.

  • saveTool()

์ˆ˜์ง‘ํ•œ ์ •๋ณด๋“ค์„ ํ† ๋Œ€๋กœ Tool ์—”ํ‹ฐํ‹ฐ๋ฅผ ์ƒ์„ฑํ•˜๊ณ  DB์— ์ €์žฅํ•œ๋‹ค.

  • processCategories()

Tool๊ณผ SubCategory ๊ฐ„ ์—ฐ๊ด€๊ด€๊ณ„๋ฅผ ์„ค์ •ํ•œ๋‹ค. ToolCategory ์—”ํ‹ฐํ‹ฐ๋Š” Tool๊ณผ SubCategory ๊ฐ„ ๋‹ค๋Œ€๋‹ค ๊ด€๊ณ„๋ฅผ ํ’€๊ธฐ ์œ„ํ•œ ์—”ํ‹ฐํ‹ฐ์ด๋‹ค.

완전히 크롤링을 수행하기 걸리는 시간은 약 3시간이다. 주기적으로 크롤링을 하지 않을 예정이기 때문에 전체적인 서비스 측면에서는 굳이 성능 개선을 하지 않고 넘어가도 무방하다. 그러나 성능을 중요시 여기는 나로서는 그냥 넘어갈 수 없다. 병렬 처리를 사용하면 의미 있는 성능 개선이 이루어질 것이라고 생각했고, 이번 기회에 자바로 병렬 처리 코드를 작성하고 싶었다.

๋‚˜๋Š” Fork/Join Framework๋ฅผ ์‚ฌ์šฉํ–ˆ๋‹ค. ์ด๋ฒˆ ๊ธฐํšŒ๋ฅผ ํ†ตํ•ด ์—ฌ๋Ÿฌ๊ฐ€์ง€ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ๋ฐฉ๋ฒ•์ด ์กด์žฌํ•œ๋‹ค๋Š” ๊ฒƒ์„ ์•Œ๊ฒŒ ๋˜์—ˆ๋Š”๋ฐ, Fork/Join Framework์— ๋Œ€ํ•œ ๋‚ด์šฉ์„ ์ •๋ฆฌํ•˜๋ฉฐ ํ•˜๋‚˜์”ฉ ์‚ดํŽด๋ณด๋ ค๊ณ  ํ•œ๋‹ค.

📌 개선된 코드

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
    /**
     * Crawls all stored sub-categories in parallel on a dedicated four-thread
     * {@link ForkJoinPool} and blocks until every sub-category has been processed.
     *
     * <p>A failure in one sub-category is logged and does not stop the others. If the calling
     * thread is interrupted while waiting, the interrupt flag is restored and the pool's
     * workers are asked to stop.
     *
     * <p>NOTE(review): the work runs on pool threads, not on the caller's thread; whether the
     * transaction opened for this method is visible there depends on the transaction manager
     * — confirm.
     */
    @Transactional
    public void saveTools() {
        List<SubCategory> subCategories = subCategoryRepository.findAll();

        // Use a dedicated ForkJoinPool for parallel processing.
        // NOTE(review): starting the parallel stream from a task submitted to this pool is
        // presumably meant to keep the stream's work on this pool rather than the common
        // pool — confirm against the JDK in use.
        ForkJoinPool customThreadPool = new ForkJoinPool(4);
        try {
            customThreadPool.submit(() ->
                subCategories.parallelStream().forEach(subCategory -> {
                    try {
                        processSubCategory(subCategory);
                    } catch (RuntimeException e) {
                        log.error("카테고리 처리 중 오류 발생: {} (카테고리: {})",
                            e.getMessage(), subCategory.getSubCategoryName(), e);
                    }
                })
            ).get();
        } catch (InterruptedException e) {
            // Restore the flag so callers can observe the interruption, and stop the workers.
            Thread.currentThread().interrupt();
            customThreadPool.shutdownNow();
            log.error("병렬 처리 중 오류 발생: {}", e.getMessage(), e);
        } catch (java.util.concurrent.ExecutionException e) {
            // The interesting failure is the wrapped cause, not the ExecutionException itself.
            Throwable cause = e.getCause() != null ? e.getCause() : e;
            log.error("병렬 처리 중 오류 발생: {}", cause.getMessage(), cause);
        } catch (RuntimeException e) {
            log.error("병렬 처리 중 오류 발생: {}", e.getMessage(), e);
        } finally {
            customThreadPool.shutdown();
        }
    }

    /**
     * Crawls one sub-category page by page and hands each listed post to
     * {@link #processPost(Element)}.
     *
     * <p>Pages are requested as {@code <subCategoryUrl>page/<n>/} starting at page 1, with a
     * 10-second timeout and a one-second pause between pages. Paging stops when a page has no
     * {@code div.latest-posts} container, when it holds no {@code post-item} entries, when a
     * fetch fails, or when the thread is interrupted. A failure in a single post is logged and
     * the remaining posts are still processed.
     *
     * @param subCategory the sub-category whose listing pages are crawled
     */
    @Transactional
    protected void processSubCategory(SubCategory subCategory) {
        String categoryUrl = subCategory.getSubCategoryUrl();
        int page = 1;

        while (true) {
            try {
                String pageUrl = categoryUrl + "page/" + page + "/";
                Document doc = Jsoup.connect(pageUrl)
                        .timeout(10000)
                        .get();

                Element latestPosts = doc.selectFirst("div[class='latest-posts']");
                if (latestPosts == null) {
                    break;
                }

                Elements posts = latestPosts.select("div[class^=post-item]");
                if (posts.isEmpty()) {
                    break;
                }

                for (Element post : posts) {
                    try {
                        processPost(post);
                    } catch (RuntimeException e) {
                        log.error("포스트 처리 중 오류 발생: {}", e.getMessage(), e);
                    }
                }

                page++;
                // Pause between pages to limit the request rate.
                Thread.sleep(1000);

            } catch (InterruptedException e) {
                // Restore the interrupt flag so the owning pool or caller can see the request.
                Thread.currentThread().interrupt();
                log.error("페이지 크롤링 중 오류 발생: {} (카테고리: {})",
                    e.getMessage(), subCategory.getSubCategoryName(), e);
                break;
            } catch (IOException | RuntimeException e) {
                log.error("페이지 크롤링 중 오류 발생: {} (카테고리: {})",
                    e.getMessage(), subCategory.getSubCategoryName(), e);
                break;
            }
        }
    }

    /**
     * Fetches the detail page linked from one listing entry (10-second timeout) and, when no
     * tool with the same name is stored yet, saves the tool and links it to its categories.
     *
     * <p>Entries without a title link, with an unresolvable link, or whose detail page lacks
     * the name or description element are skipped instead of throwing.
     *
     * <p>NOTE(review): the existence check followed by a save is not atomic; when several
     * threads crawl concurrently the same tool name could be stored twice unless the database
     * enforces uniqueness — confirm.
     *
     * @param post a listing element expected to contain {@code span.post-title a.dark-title}
     */
    @Transactional
    protected void processPost(Element post) {
        Element titleElement = post.selectFirst("span.post-title a.dark-title");
        if (titleElement == null) {
            return;
        }

        // absUrl resolves a relative href against the page's base URI and yields "" when the
        // attribute is missing or cannot be turned into a URL, which Jsoup.connect would reject.
        String detailUrl = titleElement.absUrl("href");
        if (detailUrl.isEmpty()) {
            return;
        }

        try {
            Document detailDoc = Jsoup.connect(detailUrl)
                    .timeout(10000)
                    .get();

            // selectFirst returns null when nothing matches; guard before calling text().
            Element nameElement = detailDoc.selectFirst("span[class*=post-title]");
            Element descriptionElement = detailDoc.selectFirst("span.desc-text");
            if (nameElement == null || descriptionElement == null) {
                log.warn("도구 상세 페이지에서 이름 또는 설명을 찾지 못해 건너뜀: {}", detailUrl);
                return;
            }

            String toolName = nameElement.text();
            String description = descriptionElement.text();

            if (!toolsRepository.existsByToolName(toolName)) {
                Tool savedTool = saveTool(toolName, description, detailUrl);
                processCategories(detailDoc, savedTool);
            }
        } catch (IOException e) {
            log.error("도구 상세 페이지 크롤링 중 오류 발생: {}", e.getMessage(), e);
        }
    }

    /**
     * Builds a {@code Tool} from the crawled values and persists it through
     * {@code toolsRepository}.
     *
     * @param toolName    name of the tool as read from the detail page
     * @param description description text as read from the detail page
     * @param detailUrl   URL of the detail page, stored as the tool's link
     * @return the instance returned by {@code toolsRepository.save}
     */
    private Tool saveTool(String toolName, String description, String detailUrl) {
        Tool tool = Tool.builder()
                .toolName(toolName)
                .toolDescription(description)
                .toolLink(detailUrl)
                // Start with an empty, mutable list so category links can be appended later.
                .toolCategories(new ArrayList<>())
                .build();

        Tool savedTool = toolsRepository.save(tool);
        log.info("저장된 AI 도구: {}", toolName);
        return savedTool;
    }

    /**
     * Links a saved tool to every known sub-category named on its detail page.
     *
     * <p>Category names are read from the {@code data-title} attribute of
     * {@code div.entry-categories a span[data-title]}. Names that do not match a stored
     * {@code SubCategory} are ignored. Blank names and names already handled for this page
     * are skipped so that a repeated label cannot create a duplicate link. Only the tool's
     * side of the association is updated in memory; the sub-category's collection is left
     * untouched.
     *
     * @param detailDoc parsed detail page of the tool
     * @param savedTool the persisted tool to attach categories to
     */
    @Transactional
    protected void processCategories(Document detailDoc, Tool savedTool) {
        Elements categoryElements = detailDoc.select("div.entry-categories a span[data-title]");
        // Fully qualified so the block does not depend on additional imports.
        java.util.Set<String> handledNames = new java.util.HashSet<>();

        for (Element categoryElement : categoryElements) {
            String categoryName = categoryElement.attr("data-title").trim();
            if (categoryName.isEmpty() || !handledNames.add(categoryName)) {
                continue;
            }

            SubCategory subCat = subCategoryRepository.findBySubCategoryName(categoryName)
                    .orElse(null);
            if (subCat == null) {
                continue;
            }

            ToolCategory toolCategory = ToolCategory.builder()
                    .tool(savedTool)
                    .subCategory(subCat)
                    .build();

            // Persist first and keep the instance returned by the repository.
            toolCategory = toolCategoryRepository.save(toolCategory);

            savedTool.getToolCategories().add(toolCategory);

            log.info("도구-카테고리 연결: {} - {}", savedTool.getToolName(), categoryName);
        }
    }
  • saveTools()
1
// Dedicated pool created with a parallelism level of 4.
ForkJoinPool customThreadPool = new ForkJoinPool(4);

4๊ฐœ์˜ ์“ฐ๋ ˆ๋“œ๋กœ ๊ตฌ์„ฑ๋œ pool์„ ์ƒ์„ฑํ•œ๋‹ค.

1
2
3
4
5
// Submit the parallel stream as a single task to the custom pool; get() blocks the caller
// until that task has finished. (Excerpt: the per-category try/catch is omitted here.)
customThreadPool.submit(() ->
    subCategories.parallelStream().forEach(subCategory -> {
        processSubCategory(subCategory);
    })
).get();

ํŽธ์˜์ƒ ๋กœ๊น…์€ ์ œ๊ฑฐํ–ˆ๋‹ค. parallelStream() ์„ ์‚ฌ์šฉํ•˜์—ฌ ๋ณ‘๋ ฌ ์ŠคํŠธ๋ฆผ์„ ์ƒ์„ฑํ•œ๋‹ค. ๊ฐ ์ŠคํŠธ๋ฆผ์— ๋Œ€ํ•˜์—ฌ processSubCategory()๋ฅผ ์ˆ˜ํ–‰ํ•œ๋‹ค. get() ์€ submit()์œผ๋กœ ์ œ์ถœํ•œ ์ž‘์—…์ด ์™„๋ฃŒ๋  ๋•Œ๊นŒ์ง€ ํ˜„์žฌ ์“ฐ๋ ˆ๋“œ๋ฅผ blockํ•˜๋Š” ์—ญํ• ์„ ํ•œ๋‹ค.

  • processSubCategory()
1
// Pause the current thread for 1000 ms before requesting the next page.
Thread.sleep(1000);

๊ณผ๋„ํ•œ ํŠธ๋ž˜ํ”ฝ์„ ๋ง‰๊ธฐ ์œ„ํ•ด ํ˜„์žฌ ์“ฐ๋ ˆ๋“œ๋ฅผ 1์ดˆ ๋™์•ˆ ์ค‘์ง€ํ•œ๋‹ค.

📌 성능 변화

기존 코드는 약 3시간 소요되었으나, 개선된 코드는 약 35분이 소요되었다. 약 414% 성능 개선 효과를 보았다.

This post is licensed under CC BY 4.0 by the author.