[Java] Jsoup 속도 개선 - ForkJoinPool / ForkJoin Framework
📌 문제의 코드
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
@Transactional
public void saveTools() {
List<SubCategory> subCategories = subCategoryRepository.findAll();
for (SubCategory subCategory : subCategories) {
String categoryUrl = subCategory.getSubCategoryUrl();
int page = 1;
while (true) {
try {
String pageUrl = categoryUrl + "page/" + page + "/";
Document doc = Jsoup.connect(pageUrl).get();
Element latestPosts = doc.selectFirst("div[class='latest-posts']");
if (latestPosts == null) break;
Elements posts = latestPosts.select("div[class^=post-item]");
if (posts.isEmpty()) break;
for (Element post : posts) {
processPost(post);
}
page++;
} catch (IOException e) {
log.error("ํ์ด์ง ํฌ๋กค๋ง ์ค ์ค๋ฅ ๋ฐ์: {} (์นดํ
๊ณ ๋ฆฌ: {})", e.getMessage(), subCategory.getSubCategoryName());
break;
}
}
}
}
/**
 * Downloads the detail page linked from a listing post and stores the tool it describes.
 *
 * <p>Posts without a title link, and detail pages that lack the name or description element,
 * are skipped instead of failing the crawl. A tool is saved only when no tool with the same
 * name exists yet.
 *
 * @param post a post item element taken from a category listing page
 */
// NOTE(review): this method is invoked directly from the same class; with proxy-based
// transaction handling the annotation below may have no effect on such calls — confirm.
@Transactional
protected void processPost(Element post) {
    Element titleElement = post.selectFirst("span.post-title a.dark-title");
    if (titleElement == null) {
        return;
    }
    String detailUrl = titleElement.attr("href");
    if (detailUrl.isEmpty()) {
        // attr() yields "" for a missing attribute; connecting to "" would throw.
        return;
    }
    try {
        Document detailDoc = Jsoup.connect(detailUrl).get();
        // selectFirst() returns null when nothing matches, so check before calling text().
        Element nameElement = detailDoc.selectFirst("span[class*=post-title]");
        Element descriptionElement = detailDoc.selectFirst("span.desc-text");
        if (nameElement == null || descriptionElement == null) {
            log.warn("도구 상세 페이지에서 이름 또는 설명 요소를 찾을 수 없음: {}", detailUrl);
            return;
        }
        String toolName = nameElement.text();
        String description = descriptionElement.text();
        if (!toolsRepository.existsByToolName(toolName)) {
            Tool savedTool = saveTool(toolName, description, detailUrl);
            processCategories(detailDoc, savedTool);
        }
    } catch (IOException e) {
        log.error("도구 상세 페이지 크롤링 중 오류 발생: {}", e.getMessage(), e);
    }
}
/**
 * Builds a {@code Tool} from the crawled values and persists it through the tools repository.
 *
 * @param toolName    name of the tool as read from the detail page
 * @param description description text of the tool
 * @param detailUrl   URL of the detail page, stored as the tool link
 * @return the instance returned by {@code toolsRepository.save}
 */
private Tool saveTool(String toolName, String description, String detailUrl) {
    Tool tool = Tool.builder()
            .toolName(toolName)
            .toolDescription(description)
            .toolLink(detailUrl)
            // Start with an empty, mutable list so category links can be appended later.
            .toolCategories(new ArrayList<>())
            .build();
    Tool savedTool = toolsRepository.save(tool);
    log.info("저장된 AI 도구: {}", toolName);
    return savedTool;
}
private void processCategories(Document detailDoc, Tool savedTool) {
Elements categoryElements = detailDoc.select("div.entry-categories a span[data-title]");
for (Element categoryElement : categoryElements) {
String categoryName = categoryElement.attr("data-title").trim();
SubCategory subCat = subCategoryRepository.findBySubCategoryName(categoryName)
.orElse(null);
if (subCat != null) {
ToolCategory toolCategory = ToolCategory.builder()
.tool(savedTool)
.subCategory(subCat)
.build();
savedTool.getToolCategories().add(toolCategory);
if (subCat.getToolCategories() == null) {
subCat.setToolCategories(new ArrayList<>());
}
subCat.getToolCategories().add(toolCategory);
toolCategoryRepository.save(toolCategory);
log.info("๋๊ตฌ-์นดํ
๊ณ ๋ฆฌ ์ฐ๊ฒฐ: {} - {}", savedTool.getToolName(), categoryName);
}
}
}
특정 웹사이트를 크롤링하는 코드이다. 간략하게 함수의 동작에 대해 알아보자.
- saveTools()
DB์ ์กด์ฌํ๋ SubCategory๋ฅผ ์กฐํํ์ฌ ๊ฐ SubCategory์ URL์ ๊ธฐ๋ฐ์ผ๋ก ํ์ด์ง๋ค์ด์ ํ์ฌ ํฌ๋กค๋ง์ ์งํํ๋ค. ๊ฐ ํ์ด์ง์์ post๋ฅผ ์์งํ๋ค.
- processPost()
์์งํ post์ ๋ํ์ฌ ์ ๋ณด๋ค์ ์์งํ๋ค.
- saveTool()
์์งํ ์ ๋ณด๋ค์ ํ ๋๋ก Tool ์ํฐํฐ๋ฅผ ์์ฑํ๊ณ DB์ ์ ์ฅํ๋ค.
- processCategories()
Tool๊ณผ SubCategory ๊ฐ ์ฐ๊ด๊ด๊ณ๋ฅผ ์ค์ ํ๋ค. ToolCategory ์ํฐํฐ๋ Tool๊ณผ SubCategory ๊ฐ ๋ค๋๋ค ๊ด๊ณ๋ฅผ ํ๊ธฐ ์ํ ์ํฐํฐ์ด๋ค.
완전한 크롤링을 수행하는 데 걸리는 시간은 약 3시간이다. 주기적으로 크롤링을 하지 않을 예정이기 때문에 전체적인 서비스 측면에서는 굳이 성능 개선을 하지 않고 넘어가도 무방하다. 그러나 성능을 중요시 여기는 나로서는 그냥 넘어갈 수 없다. 병렬 처리를 사용하면 의미 있는 성능 개선이 이루어질 것이라고 생각했고, 이번 기회에 자바로 병렬 처리 코드를 작성하고 싶었다.
๋๋ Fork/Join Framework
๋ฅผ ์ฌ์ฉํ๋ค. ์ด๋ฒ ๊ธฐํ๋ฅผ ํตํด ์ฌ๋ฌ๊ฐ์ง ๋ณ๋ ฌ ์ฒ๋ฆฌ ๋ฐฉ๋ฒ์ด ์กด์ฌํ๋ค๋ ๊ฒ์ ์๊ฒ ๋์๋๋ฐ, Fork/Join Framework์ ๋ํ ๋ด์ฉ์ ์ ๋ฆฌํ๋ฉฐ ํ๋์ฉ ์ดํด๋ณด๋ ค๊ณ ํ๋ค.
📌 개선된 코드
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
@Transactional
public void saveTools() {
List<SubCategory> subCategories = subCategoryRepository.findAll();
// ForkJoinPool์ ์ฌ์ฉํ์ฌ ๋ณ๋ ฌ ์ฒ๋ฆฌ
ForkJoinPool customThreadPool = new ForkJoinPool(4);
try {
customThreadPool.submit(() ->
subCategories.parallelStream().forEach(subCategory -> {
try {
processSubCategory(subCategory);
} catch (Exception e) {
log.error("์นดํ
๊ณ ๋ฆฌ ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {} (์นดํ
๊ณ ๋ฆฌ: {})",
e.getMessage(), subCategory.getSubCategoryName());
}
})
).get();
} catch (Exception e) {
log.error("๋ณ๋ ฌ ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {}", e.getMessage());
} finally {
customThreadPool.shutdown();
}
}
@Transactional
protected void processSubCategory(SubCategory subCategory) {
String categoryUrl = subCategory.getSubCategoryUrl();
int page = 1;
while (true) {
try {
String pageUrl = categoryUrl + "page/" + page + "/";
Document doc = Jsoup.connect(pageUrl)
.timeout(10000)
.get();
Element latestPosts = doc.selectFirst("div[class='latest-posts']");
if (latestPosts == null) break;
Elements posts = latestPosts.select("div[class^=post-item]");
if (posts.isEmpty()) break;
for (Element post : posts) {
try {
processPost(post);
} catch (Exception e) {
log.error("ํฌ์คํธ ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {}", e.getMessage());
}
}
page++;
Thread.sleep(1000);
} catch (Exception e) {
log.error("ํ์ด์ง ํฌ๋กค๋ง ์ค ์ค๋ฅ ๋ฐ์: {} (์นดํ
๊ณ ๋ฆฌ: {})",
e.getMessage(), subCategory.getSubCategoryName());
break;
}
}
}
/**
 * Downloads the detail page linked from a listing post (10 second timeout) and stores the
 * tool it describes.
 *
 * <p>Posts without a title link, and detail pages that lack the name or description element,
 * are skipped with a log entry. A tool is saved only when no tool with the same name exists
 * yet.
 *
 * @param post a post item element taken from a category listing page
 */
// NOTE(review): when several sub-categories are crawled concurrently, the exists-then-save
// sequence below is not atomic; presumably the same tool could be saved twice unless the
// database enforces a unique tool name — confirm.
@Transactional
protected void processPost(Element post) {
    Element titleElement = post.selectFirst("span.post-title a.dark-title");
    if (titleElement == null) {
        return;
    }
    String detailUrl = titleElement.attr("href");
    if (detailUrl.isEmpty()) {
        // attr() yields "" for a missing attribute; connecting to "" would throw.
        return;
    }
    try {
        Document detailDoc = Jsoup.connect(detailUrl)
                .timeout(10000)
                .get();
        // selectFirst() returns null when nothing matches, so check before calling text().
        Element nameElement = detailDoc.selectFirst("span[class*=post-title]");
        Element descriptionElement = detailDoc.selectFirst("span.desc-text");
        if (nameElement == null || descriptionElement == null) {
            log.warn("도구 상세 페이지에서 이름 또는 설명 요소를 찾을 수 없음: {}", detailUrl);
            return;
        }
        String toolName = nameElement.text();
        String description = descriptionElement.text();
        if (!toolsRepository.existsByToolName(toolName)) {
            Tool savedTool = saveTool(toolName, description, detailUrl);
            processCategories(detailDoc, savedTool);
        }
    } catch (IOException e) {
        log.error("도구 상세 페이지 크롤링 중 오류 발생: {}", e.getMessage(), e);
    }
}
/**
 * Builds a {@code Tool} from the crawled values and persists it through the tools repository.
 *
 * @param toolName    name of the tool as read from the detail page
 * @param description description text of the tool
 * @param detailUrl   URL of the detail page, stored as the tool link
 * @return the instance returned by {@code toolsRepository.save}
 */
private Tool saveTool(String toolName, String description, String detailUrl) {
    Tool tool = Tool.builder()
            .toolName(toolName)
            .toolDescription(description)
            .toolLink(detailUrl)
            // Start with an empty, mutable list so category links can be appended later.
            .toolCategories(new ArrayList<>())
            .build();
    Tool savedTool = toolsRepository.save(tool);
    log.info("저장된 AI 도구: {}", toolName);
    return savedTool;
}
@Transactional
protected void processCategories(Document detailDoc, Tool savedTool) {
Elements categoryElements = detailDoc.select("div.entry-categories a span[data-title]");
for (Element categoryElement : categoryElements) {
String categoryName = categoryElement.attr("data-title").trim();
SubCategory subCat = subCategoryRepository.findBySubCategoryName(categoryName)
.orElse(null);
if (subCat != null) {
ToolCategory toolCategory = ToolCategory.builder()
.tool(savedTool)
.subCategory(subCat)
.build();
toolCategory = toolCategoryRepository.save(toolCategory);
savedTool.getToolCategories().add(toolCategory);
log.info("๋๊ตฌ-์นดํ
๊ณ ๋ฆฌ ์ฐ๊ฒฐ: {} - {}", savedTool.getToolName(), categoryName);
}
}
}
- saveTools()
1
// Create a dedicated ForkJoinPool with a parallelism level of 4.
ForkJoinPool customThreadPool = new ForkJoinPool(4);
4๊ฐ์ ์ฐ๋ ๋๋ก ๊ตฌ์ฑ๋ pool์ ์์ฑํ๋ค.
1
2
3
4
5
// Submit the parallel stream as a single task to the custom pool; get() blocks the calling
// thread until that task has finished. (Logging omitted in this excerpt.)
customThreadPool.submit(() ->
subCategories.parallelStream().forEach(subCategory -> {
processSubCategory(subCategory);
})
).get();
ํธ์์ ๋ก๊น
์ ์ ๊ฑฐํ๋ค. parallelStream()
์ ์ฌ์ฉํ์ฌ ๋ณ๋ ฌ ์คํธ๋ฆผ์ ์์ฑํ๋ค. ๊ฐ ์คํธ๋ฆผ์ ๋ํ์ฌ processSubCategory()๋ฅผ ์ํํ๋ค. get()
์ submit()
์ผ๋ก ์ ์ถํ ์์
์ด ์๋ฃ๋ ๋๊น์ง ํ์ฌ ์ฐ๋ ๋๋ฅผ blockํ๋ ์ญํ ์ ํ๋ค.
- processSubCategory()
1
// Pause the current thread for 1000 ms before requesting the next page.
Thread.sleep(1000);
과도한 트래픽을 막기 위해 현재 쓰레드를 1초 동안 중지한다.
📌 성능 변화
기존 코드는 약 3시간 소요되었으나, 개선된 코드는 약 35분이 소요되었다. 약 414% 성능 개선 효과를 보였다(180분 / 35분 ≈ 5.1배 빠른 처리 속도).