From 365b15e5534f233110e4be3daa3a0821570be6e3 Mon Sep 17 00:00:00 2001 From: woozu-shin Date: Sun, 12 May 2024 23:53:54 +0900 Subject: [PATCH] Implement remained CrawlHandler --- .../crawlhandler/FmkoreaCrawlHandler.java | 41 + .../PpomppuCrawlOverseaHandler.java | 58 + .../parser/FmkoreaArticleParser.java | 68 + .../parser/PpomppuArticleParserV2.java | 11 +- .../client/fmkorea/FmkoreaBoardClient.java | 8 +- .../client/ppomppu/PpomppuBoardClient.java | 38 - .../shopping/scheduler/ParseEventEmitter.java | 3 +- .../crawl/shopping/util/DateTimeUtils.java | 21 +- .../parser/FmkoreaArticleParserTest.java | 31 + .../resources/testdata/fmkorea/file1.html | 2182 +++++++++++++++++ .../support/dto/constant/CrawlTarget.java | 4 +- 11 files changed, 2410 insertions(+), 55 deletions(-) create mode 100644 shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/PpomppuCrawlOverseaHandler.java create mode 100644 shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/FmkoreaArticleParser.java delete mode 100644 shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/infra/client/ppomppu/PpomppuBoardClient.java create mode 100644 shopping-crawler/src/test/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/FmkoreaArticleParserTest.java create mode 100644 shopping-crawler/src/test/resources/testdata/fmkorea/file1.html diff --git a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/FmkoreaCrawlHandler.java b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/FmkoreaCrawlHandler.java index bfb6938..27f35d6 100644 --- a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/FmkoreaCrawlHandler.java +++ b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/FmkoreaCrawlHandler.java @@ -1,12 +1,33 @@ package com.myoa.engineering.crawl.shopping.crawlhandler; +import com.myoa.engineering.crawl.shopping.crawlhandler.parser.FmkoreaArticleParser; +import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article; +import com.myoa.engineering.crawl.shopping.infra.client.fmkorea.FmkoreaBoardClient; +import com.myoa.engineering.crawl.shopping.service.ArticleCommandService; import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + @Slf4j @Component public class FmkoreaCrawlHandler implements CrawlHandler { + + private final FmkoreaBoardClient fmkoreaBoardClient; + private final FmkoreaArticleParser fmkoreaArticleParser; + private final ArticleCommandService articleCommandService; + + public FmkoreaCrawlHandler(FmkoreaBoardClient fmkoreaBoardClient, + FmkoreaArticleParser fmkoreaArticleParser, ArticleCommandService articleCommandService) { + this.fmkoreaBoardClient = fmkoreaBoardClient; + this.fmkoreaArticleParser = fmkoreaArticleParser; + this.articleCommandService = articleCommandService; + } + @Override public CrawlTarget getCrawlTarget() { return CrawlTarget.FMKOREA; @@ -14,5 +35,25 @@ public class FmkoreaCrawlHandler implements CrawlHandler { @Override public void handle() { + + String boardHtmlPage1 = fmkoreaBoardClient.getBoardHtml("/index.php", generateRequestParams(1)); + List
parsedPage1 = fmkoreaArticleParser.parse(boardHtmlPage1); + + String boardHtmlPage2 = fmkoreaBoardClient.getBoardHtml("/index.php", generateRequestParams(2)); + List
parsedPage2 = fmkoreaArticleParser.parse(boardHtmlPage2); + + List
merged = Stream.of(parsedPage1, parsedPage2) + .flatMap(List::stream) + .map(e -> e.updateCrawlTarget(getCrawlTarget())) + .toList(); + + articleCommandService.upsert(merged); + } + + private Map generateRequestParams(int pageId) { + Map params = new HashMap<>(); + params.put("mid", "hotdeal"); + params.put("page", String.valueOf(pageId)); + return params; } } diff --git a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/PpomppuCrawlOverseaHandler.java b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/PpomppuCrawlOverseaHandler.java new file mode 100644 index 0000000..c1e43d4 --- /dev/null +++ b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/PpomppuCrawlOverseaHandler.java @@ -0,0 +1,58 @@ +package com.myoa.engineering.crawl.shopping.crawlhandler; + +import com.myoa.engineering.crawl.shopping.crawlhandler.parser.PpomppuArticleParserV2; +import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article; +import com.myoa.engineering.crawl.shopping.infra.client.ppomppu.PpomppuBoardClientV2; +import com.myoa.engineering.crawl.shopping.service.ArticleCommandService; +import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget; +import org.springframework.stereotype.Component; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +@Component +public class PpomppuCrawlOverseaHandler implements CrawlHandler { + + private final PpomppuBoardClientV2 ppomppuBoardClient; + private final PpomppuArticleParserV2 ppomppuArticleParserV2; + private final ArticleCommandService articleCommandService; + + public PpomppuCrawlOverseaHandler(PpomppuBoardClientV2 ppomppuBoardClient, + PpomppuArticleParserV2 ppomppuArticleParserV2, + ArticleCommandService articleCommandService) { + this.ppomppuBoardClient = ppomppuBoardClient; + this.ppomppuArticleParserV2 = ppomppuArticleParserV2; + this.articleCommandService = articleCommandService; + } + + @Override + public CrawlTarget getCrawlTarget() { + return CrawlTarget.PPOMPPU_OVERSEA; + } + + @Override + public void handle() { + + String boardHtmlPage1 = ppomppuBoardClient.getBoardHtml("/zboard/zboard.php", generateRequestParams(1)); + List
parsedPage1 = ppomppuArticleParserV2.parse(boardHtmlPage1); + + String boardHtmlPage2 = ppomppuBoardClient.getBoardHtml("/zboard/zboard.php", generateRequestParams(2)); + List
parsedPage2 = ppomppuArticleParserV2.parse(boardHtmlPage2); + + List
merged = Stream.of(parsedPage1, parsedPage2) + .flatMap(List::stream) + .map(e -> e.updateCrawlTarget(getCrawlTarget())) + .toList(); + + articleCommandService.upsert(merged); + } + + private Map generateRequestParams(int pageId) { + Map params = new HashMap<>(); + params.put("id", "ppomppu4"); + params.put("page", String.valueOf(pageId)); + return params; + } +} diff --git a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/FmkoreaArticleParser.java b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/FmkoreaArticleParser.java new file mode 100644 index 0000000..17a9662 --- /dev/null +++ b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/FmkoreaArticleParser.java @@ -0,0 +1,68 @@ +package com.myoa.engineering.crawl.shopping.crawlhandler.parser; + +import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article; +import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget; +import com.myoa.engineering.crawl.shopping.util.DateTimeUtils; +import lombok.extern.slf4j.Slf4j; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.springframework.stereotype.Component; + +import java.time.ZonedDateTime; +import java.util.List; + +@Slf4j +@Component +public class FmkoreaArticleParser { + + private static final String FMKOREA_URL = "https://www.fmkorea.com"; + + public List
parse(String html) { + Elements liElements = converHtmlToTrElements(html); + return liElements.stream() +// .filter(this::isRealArticle) + .map(this::parse) + .toList(); + } + + private Elements converHtmlToTrElements(String html) { + Document document = Jsoup.parse(html); + Element liTable = document.getElementsByClass("fm_best_widget").first(); + return liTable.select("li.li_best2_pop0"); + } + + private Article parse(Element item) { + String link = item.select("h3.title a").attr("href"); + Long articleId = Long.parseLong(link.replace("/", "")); + + // https://www.fmkorea.com/7023440365 + String articleUrl = FMKOREA_URL + link; + String boardName = item.select("span.category a").text().trim(); + + String title = item.select("h3.title a").text(); + String itemPrice = item.select("div.hotdeal_info span:contains(가격) a").text(); + String deliveryPrice = item.select("div.hotdeal_info span:contains(배송) a").text(); + title = title + " 가격: " + itemPrice + " 배송: " + deliveryPrice; + String registeredAtString = item.select("span.regdate").text().trim(); + ZonedDateTime registeredAt = DateTimeUtils.parse(registeredAtString, DateTimeUtils.FORMATTER_HHMM, DateTimeUtils.FORMATTER_YYMMDD_DOT); + + Element recommendationElement = item.selectFirst("a.pc_voted_count"); + Integer recommended = null; + if (recommendationElement != null) { + recommended = Integer.parseInt(recommendationElement.selectFirst("span.count").text()); + } + + return Article.builder() + .articleId(articleId) + .title(title) + .boardName(boardName) + .articleUrl(articleUrl) + .recommended(recommended) + .registeredAt(registeredAt) + .build(); + } + + +} diff --git a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/PpomppuArticleParserV2.java b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/PpomppuArticleParserV2.java index 850f8a1..d8644b9 100644 --- a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/PpomppuArticleParserV2.java +++ b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/PpomppuArticleParserV2.java @@ -21,14 +21,11 @@ import java.util.regex.Pattern; @Slf4j @Component -public final class PpomppuArticleParserV2 { +public class PpomppuArticleParserV2 { private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yy.MM.dd HH:mm:ss") .withZone(ZoneId.of("Asia/Seoul")); - private PpomppuArticleParserV2() { - } - public List
parse(String html) { Elements trElements = converHtmlToTrElements(html); return trElements.stream() @@ -38,8 +35,8 @@ public final class PpomppuArticleParserV2 { } - private Elements converHtmlToTrElements(String data) { - Document document = Jsoup.parse(data); + private Elements converHtmlToTrElements(String html) { + Document document = Jsoup.parse(html); Elements trList = document.getElementById("revolution_main_table").getElementsByTag("tr"); return trList; } @@ -72,7 +69,7 @@ public final class PpomppuArticleParserV2 { String boardName = parseBoardName(title); Integer recommended = parseRecommended(tdList.get(4)); Integer hit = NumberUtils.parseInt(tdList.get(5).text(), 0); - ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text()); + ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text(), DateTimeUtils.FORMATTER_HHMMss, DateTimeUtils.FORMATTER_YYMMDD_SLASH); return Article.builder() .articleId(articleId) diff --git a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/infra/client/fmkorea/FmkoreaBoardClient.java b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/infra/client/fmkorea/FmkoreaBoardClient.java index 1d0f0e0..7c402bf 100644 --- a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/infra/client/fmkorea/FmkoreaBoardClient.java +++ b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/infra/client/fmkorea/FmkoreaBoardClient.java @@ -1,12 +1,16 @@ package com.myoa.engineering.crawl.shopping.infra.client.fmkorea; import org.springframework.cloud.openfeign.FeignClient; +import org.springframework.cloud.openfeign.SpringQueryMap; import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.PathVariable; -@FeignClient(value = "fmkorea-board-client", url = "https://fmkorea.com") +import java.util.Map; + +@FeignClient(value = "fmkorea-board-client", url = "https://www.fmkorea.com") public interface FmkoreaBoardClient { @GetMapping("{boardLink}") - String getBoardHtml(@PathVariable("boardLink") String boardLink); + String getBoardHtml(@PathVariable("boardLink") String boardLink, + @SpringQueryMap Map params); } diff --git a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/infra/client/ppomppu/PpomppuBoardClient.java b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/infra/client/ppomppu/PpomppuBoardClient.java deleted file mode 100644 index 1ab1b8e..0000000 --- a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/infra/client/ppomppu/PpomppuBoardClient.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.myoa.engineering.crawl.shopping.infra.client.ppomppu; - -import lombok.extern.slf4j.Slf4j; -import org.springframework.stereotype.Component; - -/** - * PpomppuBoardFeedRetriever - * - * @author Shin Woo-jin (woozu.shin@kakaoent.com) - * @since 2021-09-08 - */ -@Slf4j -@Component -public class PpomppuBoardClient { -/* - private final WebClient webClient; - - public PpomppuBoardClient(WebClient.Builder webClientBuilder) { - this.webClient = webClientBuilder.baseUrl(PpomppuBoardName.PPOMPPU_URL) - .exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml()) - .filter(WebClientFilterFactory.logRequest()) - .filter(WebClientFilterFactory.logResponse()) - .build(); - } - - public Mono getHtml(String uri) { - return webClient.get() - .uri(uri) - .exchangeToMono(e -> e.bodyToMono(String.class)) - .publishOn(Schedulers.boundedElastic()) - .onErrorResume(WebClientRequestException.class, t -> { - log.info("Exception occured, ignoring. : {}", t.getClass().getSimpleName()); - return Mono.empty(); - }); - // .doOnNext(e -> log.info("[getHtml] {}", e)); - } -*/ -} diff --git a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/scheduler/ParseEventEmitter.java b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/scheduler/ParseEventEmitter.java index ae65851..6f99680 100644 --- a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/scheduler/ParseEventEmitter.java +++ b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/scheduler/ParseEventEmitter.java @@ -3,6 +3,7 @@ package com.myoa.engineering.crawl.shopping.scheduler; import com.myoa.engineering.crawl.shopping.crawlhandler.CrawlHandler; import lombok.extern.slf4j.Slf4j; import org.springframework.scheduling.annotation.EnableScheduling; +import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; import java.util.List; @@ -18,7 +19,7 @@ public class ParseEventEmitter { this.crawlHandlers = crawlHandlers; } - // @Scheduled(cron = "0 0/5 * * * ?") + @Scheduled(cron = "0 0/5 * * * ?") public void emit() { log.info("[emitDomesticBoard] trigger fired!"); crawlHandlers.forEach(CrawlHandler::handle); diff --git a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/util/DateTimeUtils.java b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/util/DateTimeUtils.java index 3ec9d5a..da333f8 100644 --- a/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/util/DateTimeUtils.java +++ b/shopping-crawler/src/main/java/com/myoa/engineering/crawl/shopping/util/DateTimeUtils.java @@ -1,27 +1,38 @@ package com.myoa.engineering.crawl.shopping.util; +import org.springframework.lang.Nullable; + import java.time.*; import java.time.format.DateTimeFormatter; public final class DateTimeUtils { - private static final DateTimeFormatter FORMATTER_HHMMss = DateTimeFormatter.ofPattern("HH:mm:ss"); + public static final DateTimeFormatter FORMATTER_HHMMss = DateTimeFormatter.ofPattern("HH:mm:ss"); + public static final DateTimeFormatter FORMATTER_YYMMDD_SLASH = DateTimeFormatter.ofPattern("yy/MM/dd"); + public static final DateTimeFormatter FORMATTER_YYMMDD_DOT = DateTimeFormatter.ofPattern("yyyy.MM.dd"); + public static final DateTimeFormatter FORMATTER_HHMM = DateTimeFormatter.ofPattern("HH:mm"); private static final ZoneId ZONE_ASIA_SEOUL = ZoneId.of("Asia/Seoul"); private DateTimeUtils() { } - public static ZonedDateTime parse(String HHMMss) { + public static ZonedDateTime parse(String dateTimeString, DateTimeFormatter formatter, @Nullable DateTimeFormatter fallback) { try { - LocalTime time = LocalTime.parse(HHMMss, FORMATTER_HHMMss); + LocalTime time = LocalTime.parse(dateTimeString, formatter); LocalDateTime dateTime = LocalDateTime.of(LocalDate.now(), time); if (dateTime.isAfter(LocalDateTime.now())) { dateTime = dateTime.minusDays(1); } return dateTime.atZone(ZONE_ASIA_SEOUL); - } catch (Exception e) { - return null; + } catch (Exception ignored) { } + + try { + LocalDate date = LocalDate.parse(dateTimeString, fallback); + return date.atStartOfDay(ZONE_ASIA_SEOUL); + } catch (Exception ignored) { + } + return null; } } diff --git a/shopping-crawler/src/test/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/FmkoreaArticleParserTest.java b/shopping-crawler/src/test/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/FmkoreaArticleParserTest.java new file mode 100644 index 0000000..9b7affe --- /dev/null +++ b/shopping-crawler/src/test/java/com/myoa/engineering/crawl/shopping/crawlhandler/parser/FmkoreaArticleParserTest.java @@ -0,0 +1,31 @@ +package com.myoa.engineering.crawl.shopping.crawlhandler.parser; + +import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article; +import com.myoa.engineering.crawl.shopping.util.TestDataUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.List; + +class FmkoreaArticleParserTest { + + private FmkoreaArticleParser sut; + + @BeforeEach + void setUp() { + sut = new FmkoreaArticleParser(); + } + + @Test + void parse() { + // given + String boardHtml = TestDataUtils.fileToString("testdata/fmkorea/file1.html"); + + // when + List
actual = sut.parse(boardHtml); + + // then + Assertions.assertEquals(20, actual.size()); + } +} \ No newline at end of file diff --git a/shopping-crawler/src/test/resources/testdata/fmkorea/file1.html b/shopping-crawler/src/test/resources/testdata/fmkorea/file1.html new file mode 100644 index 0000000..6d44d9f --- /dev/null +++ b/shopping-crawler/src/test/resources/testdata/fmkorea/file1.html @@ -0,0 +1,2182 @@ + + + + + + + + + 핫딜 - 에펨코리아 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+
+ + + + + + +
+
+ +
+
+ + +

+ + + [연관 게시판] 자유쇼핑 + + +

+
+
+
+
+
+

+ + 핫딜 + +

+
+
+ 관리자: 독고 +
+
+
+
+
+
+ + 핫딜 + +
+ 인기 +
+
+ +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ 공지 + + 핫딜게시판 통합공지사항 + + 독고 +
+ 공지 + + 핫딜 게시판 포텐 우대 쇼핑몰 목록 + + 공지 +
+ 공지 + + 회원들의 정보요구, 구매대행 등 허용하고 있지 않음 + + 독고 +
+ 공지 + + 이슈에 편승하는 단순 포빨이용 드립성 게시글을 금지 + + 독고 +
+ + 공지 더 보기(1 + 개) +
+ 인기 + + [카카오톡] 자동차 에어컨필터 (1,000원) (무료) + 93 + + 붉은색푸른색 +
+ 인기 + + [티몬] 탑텐 셔츠 여러종 (4,950원~) (무료) + 55 + + 12월9일목요일 +
+ 인기 + + [알토란마켓] [임박] 사우어 미니 네온웜즈 구미 50g*70봉 (3.5kg) (9,900원) (-) + 46 + + 6x9=74 +
+ 인기 + + [쿠팡와우] 펩시 제로 슈거 무라벨 콜라 라임향 300ml 20개 (11,690원) (0원) + 10 + + 골박무새 +
+ 인기 + + [지마켓] 올반 슈퍼크런치 치킨텐더 440g 5팩 (23,910원) (무료) + 32 + + 밤닉밤 +
+
+ +
+
+
+
+
+ + + + + + + + + + + +
+
+
+ 인기글 +
+
+
+
+ 게시판 목록 페이징 + + + + + + + + 이전 + + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + + ... + + + 다음 + +
+ + + / 4,347 + + + + +
+
+
+
+
+ + + + +
+
+
+
+ +
+
+
+
+
+
+
+
+
    +
    +
    + +
    +
    +
    +
    + + + +
    + + + + + + + + diff --git a/support/src/main/java/com/myoa/engineering/crawl/shopping/support/dto/constant/CrawlTarget.java b/support/src/main/java/com/myoa/engineering/crawl/shopping/support/dto/constant/CrawlTarget.java index fb2b552..f3e3392 100644 --- a/support/src/main/java/com/myoa/engineering/crawl/shopping/support/dto/constant/CrawlTarget.java +++ b/support/src/main/java/com/myoa/engineering/crawl/shopping/support/dto/constant/CrawlTarget.java @@ -7,8 +7,8 @@ import lombok.Getter; @AllArgsConstructor public enum CrawlTarget { PPOMPPU_DOMESTIC("뽐뿌국내", true), - PPOMPPU_OVERSEA("뽐뿌해외", false), - FMKOREA("펨코", false), + PPOMPPU_OVERSEA("뽐뿌해외", true), + FMKOREA("펨코", true), ; private final String alias;