Implement remaining CrawlHandlers
This commit is contained in:
parent
8502b95a7d
commit
365b15e553
|
@ -1,12 +1,33 @@
|
||||||
package com.myoa.engineering.crawl.shopping.crawlhandler;
|
package com.myoa.engineering.crawl.shopping.crawlhandler;
|
||||||
|
|
||||||
|
import com.myoa.engineering.crawl.shopping.crawlhandler.parser.FmkoreaArticleParser;
|
||||||
|
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
|
||||||
|
import com.myoa.engineering.crawl.shopping.infra.client.fmkorea.FmkoreaBoardClient;
|
||||||
|
import com.myoa.engineering.crawl.shopping.service.ArticleCommandService;
|
||||||
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
|
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Component
|
@Component
|
||||||
public class FmkoreaCrawlHandler implements CrawlHandler {
|
public class FmkoreaCrawlHandler implements CrawlHandler {
|
||||||
|
|
||||||
|
private final FmkoreaBoardClient fmkoreaBoardClient;
|
||||||
|
private final FmkoreaArticleParser fmkoreaArticleParser;
|
||||||
|
private final ArticleCommandService articleCommandService;
|
||||||
|
|
||||||
|
/**
 * Wires the collaborators used by this handler.
 *
 * @param fmkoreaBoardClient    client the board HTML is requested from
 * @param fmkoreaArticleParser  parser that converts board HTML into {@link Article}s
 * @param articleCommandService service the parsed articles are upserted through
 */
public FmkoreaCrawlHandler(
        FmkoreaBoardClient fmkoreaBoardClient,
        FmkoreaArticleParser fmkoreaArticleParser,
        ArticleCommandService articleCommandService) {
    this.articleCommandService = articleCommandService;
    this.fmkoreaArticleParser = fmkoreaArticleParser;
    this.fmkoreaBoardClient = fmkoreaBoardClient;
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CrawlTarget getCrawlTarget() {
|
public CrawlTarget getCrawlTarget() {
|
||||||
return CrawlTarget.FMKOREA;
|
return CrawlTarget.FMKOREA;
|
||||||
|
@ -14,5 +35,25 @@ public class FmkoreaCrawlHandler implements CrawlHandler {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void handle() {
|
public void handle() {
|
||||||
|
|
||||||
|
String boardHtmlPage1 = fmkoreaBoardClient.getBoardHtml("/index.php", generateRequestParams(1));
|
||||||
|
List<Article> parsedPage1 = fmkoreaArticleParser.parse(boardHtmlPage1);
|
||||||
|
|
||||||
|
String boardHtmlPage2 = fmkoreaBoardClient.getBoardHtml("/index.php", generateRequestParams(2));
|
||||||
|
List<Article> parsedPage2 = fmkoreaArticleParser.parse(boardHtmlPage2);
|
||||||
|
|
||||||
|
List<Article> merged = Stream.of(parsedPage1, parsedPage2)
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.map(e -> e.updateCrawlTarget(getCrawlTarget()))
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
articleCommandService.upsert(merged);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, String> generateRequestParams(int pageId) {
|
||||||
|
Map<String, String> params = new HashMap<>();
|
||||||
|
params.put("mid", "hotdeal");
|
||||||
|
params.put("page", String.valueOf(pageId));
|
||||||
|
return params;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
package com.myoa.engineering.crawl.shopping.crawlhandler;
|
||||||
|
|
||||||
|
import com.myoa.engineering.crawl.shopping.crawlhandler.parser.PpomppuArticleParserV2;
|
||||||
|
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
|
||||||
|
import com.myoa.engineering.crawl.shopping.infra.client.ppomppu.PpomppuBoardClientV2;
|
||||||
|
import com.myoa.engineering.crawl.shopping.service.ArticleCommandService;
|
||||||
|
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
public class PpomppuCrawlOverseaHandler implements CrawlHandler {
|
||||||
|
|
||||||
|
private final PpomppuBoardClientV2 ppomppuBoardClient;
|
||||||
|
private final PpomppuArticleParserV2 ppomppuArticleParserV2;
|
||||||
|
private final ArticleCommandService articleCommandService;
|
||||||
|
|
||||||
|
public PpomppuCrawlOverseaHandler(PpomppuBoardClientV2 ppomppuBoardClient,
|
||||||
|
PpomppuArticleParserV2 ppomppuArticleParserV2,
|
||||||
|
ArticleCommandService articleCommandService) {
|
||||||
|
this.ppomppuBoardClient = ppomppuBoardClient;
|
||||||
|
this.ppomppuArticleParserV2 = ppomppuArticleParserV2;
|
||||||
|
this.articleCommandService = articleCommandService;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public CrawlTarget getCrawlTarget() {
|
||||||
|
return CrawlTarget.PPOMPPU_OVERSEA;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void handle() {
|
||||||
|
|
||||||
|
String boardHtmlPage1 = ppomppuBoardClient.getBoardHtml("/zboard/zboard.php", generateRequestParams(1));
|
||||||
|
List<Article> parsedPage1 = ppomppuArticleParserV2.parse(boardHtmlPage1);
|
||||||
|
|
||||||
|
String boardHtmlPage2 = ppomppuBoardClient.getBoardHtml("/zboard/zboard.php", generateRequestParams(2));
|
||||||
|
List<Article> parsedPage2 = ppomppuArticleParserV2.parse(boardHtmlPage2);
|
||||||
|
|
||||||
|
List<Article> merged = Stream.of(parsedPage1, parsedPage2)
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.map(e -> e.updateCrawlTarget(getCrawlTarget()))
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
articleCommandService.upsert(merged);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, String> generateRequestParams(int pageId) {
|
||||||
|
Map<String, String> params = new HashMap<>();
|
||||||
|
params.put("id", "ppomppu4");
|
||||||
|
params.put("page", String.valueOf(pageId));
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,68 @@
|
||||||
|
package com.myoa.engineering.crawl.shopping.crawlhandler.parser;
|
||||||
|
|
||||||
|
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
|
||||||
|
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
|
||||||
|
import com.myoa.engineering.crawl.shopping.util.DateTimeUtils;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import java.time.ZonedDateTime;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Component
|
||||||
|
public class FmkoreaArticleParser {
|
||||||
|
|
||||||
|
private static final String FMKOREA_URL = "https://www.fmkorea.com";
|
||||||
|
|
||||||
|
public List<Article> parse(String html) {
|
||||||
|
Elements liElements = converHtmlToTrElements(html);
|
||||||
|
return liElements.stream()
|
||||||
|
// .filter(this::isRealArticle)
|
||||||
|
.map(this::parse)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Elements converHtmlToTrElements(String html) {
|
||||||
|
Document document = Jsoup.parse(html);
|
||||||
|
Element liTable = document.getElementsByClass("fm_best_widget").first();
|
||||||
|
return liTable.select("li.li_best2_pop0");
|
||||||
|
}
|
||||||
|
|
||||||
|
private Article parse(Element item) {
|
||||||
|
String link = item.select("h3.title a").attr("href");
|
||||||
|
Long articleId = Long.parseLong(link.replace("/", ""));
|
||||||
|
|
||||||
|
// https://www.fmkorea.com/7023440365
|
||||||
|
String articleUrl = FMKOREA_URL + link;
|
||||||
|
String boardName = item.select("span.category a").text().trim();
|
||||||
|
|
||||||
|
String title = item.select("h3.title a").text();
|
||||||
|
String itemPrice = item.select("div.hotdeal_info span:contains(가격) a").text();
|
||||||
|
String deliveryPrice = item.select("div.hotdeal_info span:contains(배송) a").text();
|
||||||
|
title = title + " 가격: " + itemPrice + " 배송: " + deliveryPrice;
|
||||||
|
String registeredAtString = item.select("span.regdate").text().trim();
|
||||||
|
ZonedDateTime registeredAt = DateTimeUtils.parse(registeredAtString, DateTimeUtils.FORMATTER_HHMM, DateTimeUtils.FORMATTER_YYMMDD_DOT);
|
||||||
|
|
||||||
|
Element recommendationElement = item.selectFirst("a.pc_voted_count");
|
||||||
|
Integer recommended = null;
|
||||||
|
if (recommendationElement != null) {
|
||||||
|
recommended = Integer.parseInt(recommendationElement.selectFirst("span.count").text());
|
||||||
|
}
|
||||||
|
|
||||||
|
return Article.builder()
|
||||||
|
.articleId(articleId)
|
||||||
|
.title(title)
|
||||||
|
.boardName(boardName)
|
||||||
|
.articleUrl(articleUrl)
|
||||||
|
.recommended(recommended)
|
||||||
|
.registeredAt(registeredAt)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -21,14 +21,11 @@ import java.util.regex.Pattern;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Component
|
@Component
|
||||||
public final class PpomppuArticleParserV2 {
|
public class PpomppuArticleParserV2 {
|
||||||
|
|
||||||
private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yy.MM.dd HH:mm:ss")
|
private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yy.MM.dd HH:mm:ss")
|
||||||
.withZone(ZoneId.of("Asia/Seoul"));
|
.withZone(ZoneId.of("Asia/Seoul"));
|
||||||
|
|
||||||
private PpomppuArticleParserV2() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Article> parse(String html) {
|
public List<Article> parse(String html) {
|
||||||
Elements trElements = converHtmlToTrElements(html);
|
Elements trElements = converHtmlToTrElements(html);
|
||||||
return trElements.stream()
|
return trElements.stream()
|
||||||
|
@ -38,8 +35,8 @@ public final class PpomppuArticleParserV2 {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Elements converHtmlToTrElements(String data) {
|
private Elements converHtmlToTrElements(String html) {
|
||||||
Document document = Jsoup.parse(data);
|
Document document = Jsoup.parse(html);
|
||||||
Elements trList = document.getElementById("revolution_main_table").getElementsByTag("tr");
|
Elements trList = document.getElementById("revolution_main_table").getElementsByTag("tr");
|
||||||
return trList;
|
return trList;
|
||||||
}
|
}
|
||||||
|
@ -72,7 +69,7 @@ public final class PpomppuArticleParserV2 {
|
||||||
String boardName = parseBoardName(title);
|
String boardName = parseBoardName(title);
|
||||||
Integer recommended = parseRecommended(tdList.get(4));
|
Integer recommended = parseRecommended(tdList.get(4));
|
||||||
Integer hit = NumberUtils.parseInt(tdList.get(5).text(), 0);
|
Integer hit = NumberUtils.parseInt(tdList.get(5).text(), 0);
|
||||||
ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text());
|
ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text(), DateTimeUtils.FORMATTER_HHMMss, DateTimeUtils.FORMATTER_YYMMDD_SLASH);
|
||||||
|
|
||||||
return Article.builder()
|
return Article.builder()
|
||||||
.articleId(articleId)
|
.articleId(articleId)
|
||||||
|
|
|
@ -1,12 +1,16 @@
|
||||||
package com.myoa.engineering.crawl.shopping.infra.client.fmkorea;
|
package com.myoa.engineering.crawl.shopping.infra.client.fmkorea;
|
||||||
|
|
||||||
import org.springframework.cloud.openfeign.FeignClient;
|
import org.springframework.cloud.openfeign.FeignClient;
|
||||||
|
import org.springframework.cloud.openfeign.SpringQueryMap;
|
||||||
import org.springframework.web.bind.annotation.GetMapping;
|
import org.springframework.web.bind.annotation.GetMapping;
|
||||||
import org.springframework.web.bind.annotation.PathVariable;
|
import org.springframework.web.bind.annotation.PathVariable;
|
||||||
|
|
||||||
@FeignClient(value = "fmkorea-board-client", url = "https://fmkorea.com")
|
import java.util.Map;
|
||||||
|
|
||||||
|
@FeignClient(value = "fmkorea-board-client", url = "https://www.fmkorea.com")
|
||||||
public interface FmkoreaBoardClient {
|
public interface FmkoreaBoardClient {
|
||||||
|
|
||||||
@GetMapping("{boardLink}")
|
@GetMapping("{boardLink}")
|
||||||
String getBoardHtml(@PathVariable("boardLink") String boardLink);
|
String getBoardHtml(@PathVariable("boardLink") String boardLink,
|
||||||
|
@SpringQueryMap Map<String, String> params);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
package com.myoa.engineering.crawl.shopping.infra.client.ppomppu;
|
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.springframework.stereotype.Component;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* PpomppuBoardFeedRetriever
|
|
||||||
*
|
|
||||||
* @author Shin Woo-jin (woozu.shin@kakaoent.com)
|
|
||||||
* @since 2021-09-08
|
|
||||||
*/
|
|
||||||
@Slf4j
|
|
||||||
@Component
|
|
||||||
public class PpomppuBoardClient {
|
|
||||||
/*
|
|
||||||
private final WebClient webClient;
|
|
||||||
|
|
||||||
public PpomppuBoardClient(WebClient.Builder webClientBuilder) {
|
|
||||||
this.webClient = webClientBuilder.baseUrl(PpomppuBoardName.PPOMPPU_URL)
|
|
||||||
.exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml())
|
|
||||||
.filter(WebClientFilterFactory.logRequest())
|
|
||||||
.filter(WebClientFilterFactory.logResponse())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Mono<String> getHtml(String uri) {
|
|
||||||
return webClient.get()
|
|
||||||
.uri(uri)
|
|
||||||
.exchangeToMono(e -> e.bodyToMono(String.class))
|
|
||||||
.publishOn(Schedulers.boundedElastic())
|
|
||||||
.onErrorResume(WebClientRequestException.class, t -> {
|
|
||||||
log.info("Exception occured, ignoring. : {}", t.getClass().getSimpleName());
|
|
||||||
return Mono.empty();
|
|
||||||
});
|
|
||||||
// .doOnNext(e -> log.info("[getHtml] {}", e));
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
|
|
@ -3,6 +3,7 @@ package com.myoa.engineering.crawl.shopping.scheduler;
|
||||||
import com.myoa.engineering.crawl.shopping.crawlhandler.CrawlHandler;
|
import com.myoa.engineering.crawl.shopping.crawlhandler.CrawlHandler;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.springframework.scheduling.annotation.EnableScheduling;
|
import org.springframework.scheduling.annotation.EnableScheduling;
|
||||||
|
import org.springframework.scheduling.annotation.Scheduled;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -18,7 +19,7 @@ public class ParseEventEmitter {
|
||||||
this.crawlHandlers = crawlHandlers;
|
this.crawlHandlers = crawlHandlers;
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Scheduled(cron = "0 0/5 * * * ?")
|
@Scheduled(cron = "0 0/5 * * * ?")
|
||||||
public void emit() {
|
public void emit() {
|
||||||
log.info("[emitDomesticBoard] trigger fired!");
|
log.info("[emitDomesticBoard] trigger fired!");
|
||||||
crawlHandlers.forEach(CrawlHandler::handle);
|
crawlHandlers.forEach(CrawlHandler::handle);
|
||||||
|
|
|
@ -1,27 +1,38 @@
|
||||||
package com.myoa.engineering.crawl.shopping.util;
|
package com.myoa.engineering.crawl.shopping.util;
|
||||||
|
|
||||||
|
import org.springframework.lang.Nullable;
|
||||||
|
|
||||||
import java.time.*;
|
import java.time.*;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
|
|
||||||
public final class DateTimeUtils {
|
public final class DateTimeUtils {
|
||||||
|
|
||||||
private static final DateTimeFormatter FORMATTER_HHMMss = DateTimeFormatter.ofPattern("HH:mm:ss");
|
public static final DateTimeFormatter FORMATTER_HHMMss = DateTimeFormatter.ofPattern("HH:mm:ss");
|
||||||
|
public static final DateTimeFormatter FORMATTER_YYMMDD_SLASH = DateTimeFormatter.ofPattern("yy/MM/dd");
|
||||||
|
public static final DateTimeFormatter FORMATTER_YYMMDD_DOT = DateTimeFormatter.ofPattern("yyyy.MM.dd");
|
||||||
|
public static final DateTimeFormatter FORMATTER_HHMM = DateTimeFormatter.ofPattern("HH:mm");
|
||||||
private static final ZoneId ZONE_ASIA_SEOUL = ZoneId.of("Asia/Seoul");
|
private static final ZoneId ZONE_ASIA_SEOUL = ZoneId.of("Asia/Seoul");
|
||||||
|
|
||||||
private DateTimeUtils() {
|
private DateTimeUtils() {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static ZonedDateTime parse(String HHMMss) {
|
public static ZonedDateTime parse(String dateTimeString, DateTimeFormatter formatter, @Nullable DateTimeFormatter fallback) {
|
||||||
try {
|
try {
|
||||||
LocalTime time = LocalTime.parse(HHMMss, FORMATTER_HHMMss);
|
LocalTime time = LocalTime.parse(dateTimeString, formatter);
|
||||||
LocalDateTime dateTime = LocalDateTime.of(LocalDate.now(), time);
|
LocalDateTime dateTime = LocalDateTime.of(LocalDate.now(), time);
|
||||||
if (dateTime.isAfter(LocalDateTime.now())) {
|
if (dateTime.isAfter(LocalDateTime.now())) {
|
||||||
dateTime = dateTime.minusDays(1);
|
dateTime = dateTime.minusDays(1);
|
||||||
}
|
}
|
||||||
return dateTime.atZone(ZONE_ASIA_SEOUL);
|
return dateTime.atZone(ZONE_ASIA_SEOUL);
|
||||||
} catch (Exception e) {
|
} catch (Exception ignored) {
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
LocalDate date = LocalDate.parse(dateTimeString, fallback);
|
||||||
|
return date.atStartOfDay(ZONE_ASIA_SEOUL);
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
package com.myoa.engineering.crawl.shopping.crawlhandler.parser;
|
||||||
|
|
||||||
|
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
|
||||||
|
import com.myoa.engineering.crawl.shopping.util.TestDataUtils;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
class FmkoreaArticleParserTest {
|
||||||
|
|
||||||
|
private FmkoreaArticleParser sut;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
sut = new FmkoreaArticleParser();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void parse() {
|
||||||
|
// given
|
||||||
|
String boardHtml = TestDataUtils.fileToString("testdata/fmkorea/file1.html");
|
||||||
|
|
||||||
|
// when
|
||||||
|
List<Article> actual = sut.parse(boardHtml);
|
||||||
|
|
||||||
|
// then
|
||||||
|
Assertions.assertEquals(20, actual.size());
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
|
@ -7,8 +7,8 @@ import lombok.Getter;
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public enum CrawlTarget {
|
public enum CrawlTarget {
|
||||||
PPOMPPU_DOMESTIC("뽐뿌국내", true),
|
PPOMPPU_DOMESTIC("뽐뿌국내", true),
|
||||||
PPOMPPU_OVERSEA("뽐뿌해외", false),
|
PPOMPPU_OVERSEA("뽐뿌해외", true),
|
||||||
FMKOREA("펨코", false),
|
FMKOREA("펨코", true),
|
||||||
;
|
;
|
||||||
|
|
||||||
private final String alias;
|
private final String alias;
|
||||||
|
|
Loading…
Reference in New Issue