Implement remaining CrawlHandlers

This commit is contained in:
woozu-shin 2024-05-12 23:53:54 +09:00
parent 8502b95a7d
commit 365b15e553
11 changed files with 2410 additions and 55 deletions

View File

@ -1,12 +1,33 @@
package com.myoa.engineering.crawl.shopping.crawlhandler; package com.myoa.engineering.crawl.shopping.crawlhandler;
import com.myoa.engineering.crawl.shopping.crawlhandler.parser.FmkoreaArticleParser;
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
import com.myoa.engineering.crawl.shopping.infra.client.fmkorea.FmkoreaBoardClient;
import com.myoa.engineering.crawl.shopping.service.ArticleCommandService;
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget; import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
@Slf4j @Slf4j
@Component @Component
public class FmkoreaCrawlHandler implements CrawlHandler { public class FmkoreaCrawlHandler implements CrawlHandler {
private final FmkoreaBoardClient fmkoreaBoardClient;
private final FmkoreaArticleParser fmkoreaArticleParser;
private final ArticleCommandService articleCommandService;
public FmkoreaCrawlHandler(FmkoreaBoardClient fmkoreaBoardClient,
FmkoreaArticleParser fmkoreaArticleParser, ArticleCommandService articleCommandService) {
this.fmkoreaBoardClient = fmkoreaBoardClient;
this.fmkoreaArticleParser = fmkoreaArticleParser;
this.articleCommandService = articleCommandService;
}
@Override @Override
public CrawlTarget getCrawlTarget() { public CrawlTarget getCrawlTarget() {
return CrawlTarget.FMKOREA; return CrawlTarget.FMKOREA;
@ -14,5 +35,25 @@ public class FmkoreaCrawlHandler implements CrawlHandler {
@Override @Override
public void handle() { public void handle() {
String boardHtmlPage1 = fmkoreaBoardClient.getBoardHtml("/index.php", generateRequestParams(1));
List<Article> parsedPage1 = fmkoreaArticleParser.parse(boardHtmlPage1);
String boardHtmlPage2 = fmkoreaBoardClient.getBoardHtml("/index.php", generateRequestParams(2));
List<Article> parsedPage2 = fmkoreaArticleParser.parse(boardHtmlPage2);
List<Article> merged = Stream.of(parsedPage1, parsedPage2)
.flatMap(List::stream)
.map(e -> e.updateCrawlTarget(getCrawlTarget()))
.toList();
articleCommandService.upsert(merged);
}
private Map<String, String> generateRequestParams(int pageId) {
Map<String, String> params = new HashMap<>();
params.put("mid", "hotdeal");
params.put("page", String.valueOf(pageId));
return params;
} }
} }

View File

@ -0,0 +1,58 @@
package com.myoa.engineering.crawl.shopping.crawlhandler;
import com.myoa.engineering.crawl.shopping.crawlhandler.parser.PpomppuArticleParserV2;
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
import com.myoa.engineering.crawl.shopping.infra.client.ppomppu.PpomppuBoardClientV2;
import com.myoa.engineering.crawl.shopping.service.ArticleCommandService;
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
import org.springframework.stereotype.Component;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
/**
 * {@link CrawlHandler} for {@link CrawlTarget#PPOMPPU_OVERSEA}.
 *
 * <p>Each run requests pages 1 and 2 of the {@code ppomppu4} board through
 * {@link PpomppuBoardClientV2}, parses both responses with {@link PpomppuArticleParserV2},
 * tags every parsed article with this handler's crawl target and hands the combined
 * list to {@link ArticleCommandService#upsert}.
 */
@Component
public class PpomppuCrawlOverseaHandler implements CrawlHandler {

    /** Path of the board listing endpoint passed to the board client. */
    private static final String BOARD_PATH = "/zboard/zboard.php";

    /** Value of the {@code id} query parameter sent with every request. */
    private static final String BOARD_ID = "ppomppu4";

    private final PpomppuBoardClientV2 ppomppuBoardClient;
    private final PpomppuArticleParserV2 ppomppuArticleParserV2;
    private final ArticleCommandService articleCommandService;

    public PpomppuCrawlOverseaHandler(PpomppuBoardClientV2 ppomppuBoardClient,
                                      PpomppuArticleParserV2 ppomppuArticleParserV2,
                                      ArticleCommandService articleCommandService) {
        this.ppomppuBoardClient = ppomppuBoardClient;
        this.ppomppuArticleParserV2 = ppomppuArticleParserV2;
        this.articleCommandService = articleCommandService;
    }

    @Override
    public CrawlTarget getCrawlTarget() {
        return CrawlTarget.PPOMPPU_OVERSEA;
    }

    /**
     * Crawls board pages 1 and 2 (in that order) and upserts the parsed articles.
     */
    @Override
    public void handle() {
        // Both pages are fetched and parsed before any article is tagged.
        final List<Article> firstPage = fetchAndParse(1);
        final List<Article> secondPage = fetchAndParse(2);

        final List<Article> tagged = Stream.concat(firstPage.stream(), secondPage.stream())
                .map(article -> article.updateCrawlTarget(getCrawlTarget()))
                .toList();

        articleCommandService.upsert(tagged);
    }

    /**
     * Requests one board page and parses the returned HTML.
     *
     * @param pageId page number sent as the {@code page} query parameter
     * @return the articles the parser produced for that page
     */
    private List<Article> fetchAndParse(int pageId) {
        final String boardHtml = ppomppuBoardClient.getBoardHtml(BOARD_PATH, generateRequestParams(pageId));
        return ppomppuArticleParserV2.parse(boardHtml);
    }

    /**
     * Builds the query parameters for one board page request.
     *
     * @param pageId page number to request
     * @return a mutable map holding the {@code id} and {@code page} parameters
     */
    private Map<String, String> generateRequestParams(int pageId) {
        final Map<String, String> query = new HashMap<>();
        query.put("id", BOARD_ID);
        query.put("page", String.valueOf(pageId));
        return query;
    }
}

View File

@ -0,0 +1,68 @@
package com.myoa.engineering.crawl.shopping.crawlhandler.parser;
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
import com.myoa.engineering.crawl.shopping.util.DateTimeUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import java.time.ZonedDateTime;
import java.util.List;
/**
 * Parses the HTML of the FMKorea board listing into {@link Article} entities.
 *
 * <p>The parser looks for the first element with class {@code fm_best_widget} and
 * converts every {@code li.li_best2_pop0} item inside it into one article.
 */
@Slf4j
@Component
public class FmkoreaArticleParser {

    private static final String FMKOREA_URL = "https://www.fmkorea.com";

    /**
     * Parses a board listing page.
     *
     * @param html raw HTML of the board page; {@code null} yields an empty list
     * @return one {@link Article} per list item, or an empty list when the listing
     *         widget is not present in the document
     * @throws NumberFormatException if an item's title link is not of the form
     *                               {@code /<numeric id>}
     */
    public List<Article> parse(String html) {
        if (html == null) {
            log.warn("[FmkoreaArticleParser] board html is null; no articles parsed.");
            return List.of();
        }
        return convertHtmlToLiElements(html).stream()
                .map(this::parseArticle)
                .toList();
    }

    /**
     * Extracts the article list items from the page.
     *
     * @return the matching {@code li} elements, or an empty collection when the
     *         {@code fm_best_widget} container is missing
     */
    private Elements convertHtmlToLiElements(String html) {
        Document document = Jsoup.parse(html);
        // Elements#first() returns null when nothing matched, so guard before selecting.
        Element listWidget = document.getElementsByClass("fm_best_widget").first();
        if (listWidget == null) {
            log.warn("[FmkoreaArticleParser] 'fm_best_widget' element not found; no articles parsed.");
            return new Elements();
        }
        return listWidget.select("li.li_best2_pop0");
    }

    /**
     * Converts a single list item into an {@link Article}.
     *
     * <p>The price and delivery fee found in {@code div.hotdeal_info} are appended to the title.
     */
    private Article parseArticle(Element item) {
        Elements titleAnchor = item.select("h3.title a");
        String link = titleAnchor.attr("href");
        // The link is expected to look like "/7023440365", i.e. https://www.fmkorea.com/7023440365
        Long articleId = Long.parseLong(link.replace("/", ""));
        String articleUrl = FMKOREA_URL + link;
        String boardName = item.select("span.category a").text().trim();

        String itemPrice = item.select("div.hotdeal_info span:contains(가격) a").text();
        String deliveryPrice = item.select("div.hotdeal_info span:contains(배송) a").text();
        String title = titleAnchor.text() + " 가격: " + itemPrice + " 배송: " + deliveryPrice;

        String registeredAtString = item.select("span.regdate").text().trim();
        ZonedDateTime registeredAt = DateTimeUtils.parse(registeredAtString, DateTimeUtils.FORMATTER_HHMM, DateTimeUtils.FORMATTER_YYMMDD_DOT);

        return Article.builder()
                .articleId(articleId)
                .title(title)
                .boardName(boardName)
                .articleUrl(articleUrl)
                .recommended(parseRecommended(item))
                .registeredAt(registeredAt)
                .build();
    }

    /**
     * Reads the recommendation count of a list item.
     *
     * @return the count, or {@code null} when the vote element or its {@code span.count}
     *         child is absent, or when the count text is not an integer
     */
    private Integer parseRecommended(Element item) {
        Element voteAnchor = item.selectFirst("a.pc_voted_count");
        if (voteAnchor == null) {
            return null;
        }
        // selectFirst returns null when there is no match; the count is optional, so leave it unset.
        Element countSpan = voteAnchor.selectFirst("span.count");
        if (countSpan == null) {
            return null;
        }
        String countText = countSpan.text().trim();
        try {
            return Integer.parseInt(countText);
        } catch (NumberFormatException e) {
            // A malformed optional field should not abort parsing of the whole page.
            log.warn("[FmkoreaArticleParser] unparsable recommendation count '{}'; leaving it unset.", countText);
            return null;
        }
    }
}

View File

@ -21,14 +21,11 @@ import java.util.regex.Pattern;
@Slf4j @Slf4j
@Component @Component
public final class PpomppuArticleParserV2 { public class PpomppuArticleParserV2 {
private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yy.MM.dd HH:mm:ss") private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yy.MM.dd HH:mm:ss")
.withZone(ZoneId.of("Asia/Seoul")); .withZone(ZoneId.of("Asia/Seoul"));
private PpomppuArticleParserV2() {
}
public List<Article> parse(String html) { public List<Article> parse(String html) {
Elements trElements = converHtmlToTrElements(html); Elements trElements = converHtmlToTrElements(html);
return trElements.stream() return trElements.stream()
@ -38,8 +35,8 @@ public final class PpomppuArticleParserV2 {
} }
private Elements converHtmlToTrElements(String data) { private Elements converHtmlToTrElements(String html) {
Document document = Jsoup.parse(data); Document document = Jsoup.parse(html);
Elements trList = document.getElementById("revolution_main_table").getElementsByTag("tr"); Elements trList = document.getElementById("revolution_main_table").getElementsByTag("tr");
return trList; return trList;
} }
@ -72,7 +69,7 @@ public final class PpomppuArticleParserV2 {
String boardName = parseBoardName(title); String boardName = parseBoardName(title);
Integer recommended = parseRecommended(tdList.get(4)); Integer recommended = parseRecommended(tdList.get(4));
Integer hit = NumberUtils.parseInt(tdList.get(5).text(), 0); Integer hit = NumberUtils.parseInt(tdList.get(5).text(), 0);
ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text()); ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text(), DateTimeUtils.FORMATTER_HHMMss, DateTimeUtils.FORMATTER_YYMMDD_SLASH);
return Article.builder() return Article.builder()
.articleId(articleId) .articleId(articleId)

View File

@ -1,12 +1,16 @@
package com.myoa.engineering.crawl.shopping.infra.client.fmkorea; package com.myoa.engineering.crawl.shopping.infra.client.fmkorea;
import org.springframework.cloud.openfeign.FeignClient; import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.cloud.openfeign.SpringQueryMap;
import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.PathVariable;
@FeignClient(value = "fmkorea-board-client", url = "https://fmkorea.com") import java.util.Map;
@FeignClient(value = "fmkorea-board-client", url = "https://www.fmkorea.com")
public interface FmkoreaBoardClient { public interface FmkoreaBoardClient {
@GetMapping("{boardLink}") @GetMapping("{boardLink}")
String getBoardHtml(@PathVariable("boardLink") String boardLink); String getBoardHtml(@PathVariable("boardLink") String boardLink,
@SpringQueryMap Map<String, String> params);
} }

View File

@ -1,38 +0,0 @@
package com.myoa.engineering.crawl.shopping.infra.client.ppomppu;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
/**
* PpomppuBoardClient
*
* <p>Spring component that currently declares no fields or methods. The only content of
* the class body is a commented-out implementation built on a reactive {@code WebClient},
* exposing a {@code getHtml(String uri)} method.
*
* @author Shin Woo-jin (woozu.shin@kakaoent.com)
* @since 2021-09-08
*/
@Slf4j
@Component
public class PpomppuBoardClient {
// Disabled implementation, kept verbatim; nothing below is compiled.
/*
private final WebClient webClient;
public PpomppuBoardClient(WebClient.Builder webClientBuilder) {
this.webClient = webClientBuilder.baseUrl(PpomppuBoardName.PPOMPPU_URL)
.exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml())
.filter(WebClientFilterFactory.logRequest())
.filter(WebClientFilterFactory.logResponse())
.build();
}
public Mono<String> getHtml(String uri) {
return webClient.get()
.uri(uri)
.exchangeToMono(e -> e.bodyToMono(String.class))
.publishOn(Schedulers.boundedElastic())
.onErrorResume(WebClientRequestException.class, t -> {
log.info("Exception occured, ignoring. : {}", t.getClass().getSimpleName());
return Mono.empty();
});
// .doOnNext(e -> log.info("[getHtml] {}", e));
}
*/
}

View File

@ -3,6 +3,7 @@ package com.myoa.engineering.crawl.shopping.scheduler;
import com.myoa.engineering.crawl.shopping.crawlhandler.CrawlHandler; import com.myoa.engineering.crawl.shopping.crawlhandler.CrawlHandler;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.EnableScheduling; import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.util.List; import java.util.List;
@ -18,7 +19,7 @@ public class ParseEventEmitter {
this.crawlHandlers = crawlHandlers; this.crawlHandlers = crawlHandlers;
} }
// @Scheduled(cron = "0 0/5 * * * ?") @Scheduled(cron = "0 0/5 * * * ?")
public void emit() { public void emit() {
log.info("[emitDomesticBoard] trigger fired!"); log.info("[emitDomesticBoard] trigger fired!");
crawlHandlers.forEach(CrawlHandler::handle); crawlHandlers.forEach(CrawlHandler::handle);

View File

@ -1,27 +1,38 @@
package com.myoa.engineering.crawl.shopping.util; package com.myoa.engineering.crawl.shopping.util;
import org.springframework.lang.Nullable;
import java.time.*; import java.time.*;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
public final class DateTimeUtils { public final class DateTimeUtils {
private static final DateTimeFormatter FORMATTER_HHMMss = DateTimeFormatter.ofPattern("HH:mm:ss"); public static final DateTimeFormatter FORMATTER_HHMMss = DateTimeFormatter.ofPattern("HH:mm:ss");
public static final DateTimeFormatter FORMATTER_YYMMDD_SLASH = DateTimeFormatter.ofPattern("yy/MM/dd");
public static final DateTimeFormatter FORMATTER_YYMMDD_DOT = DateTimeFormatter.ofPattern("yyyy.MM.dd");
public static final DateTimeFormatter FORMATTER_HHMM = DateTimeFormatter.ofPattern("HH:mm");
private static final ZoneId ZONE_ASIA_SEOUL = ZoneId.of("Asia/Seoul"); private static final ZoneId ZONE_ASIA_SEOUL = ZoneId.of("Asia/Seoul");
private DateTimeUtils() { private DateTimeUtils() {
} }
public static ZonedDateTime parse(String HHMMss) { public static ZonedDateTime parse(String dateTimeString, DateTimeFormatter formatter, @Nullable DateTimeFormatter fallback) {
try { try {
LocalTime time = LocalTime.parse(HHMMss, FORMATTER_HHMMss); LocalTime time = LocalTime.parse(dateTimeString, formatter);
LocalDateTime dateTime = LocalDateTime.of(LocalDate.now(), time); LocalDateTime dateTime = LocalDateTime.of(LocalDate.now(), time);
if (dateTime.isAfter(LocalDateTime.now())) { if (dateTime.isAfter(LocalDateTime.now())) {
dateTime = dateTime.minusDays(1); dateTime = dateTime.minusDays(1);
} }
return dateTime.atZone(ZONE_ASIA_SEOUL); return dateTime.atZone(ZONE_ASIA_SEOUL);
} catch (Exception e) { } catch (Exception ignored) {
return null;
} }
try {
LocalDate date = LocalDate.parse(dateTimeString, fallback);
return date.atStartOfDay(ZONE_ASIA_SEOUL);
} catch (Exception ignored) {
}
return null;
} }
} }

View File

@ -0,0 +1,31 @@
package com.myoa.engineering.crawl.shopping.crawlhandler.parser;
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
import com.myoa.engineering.crawl.shopping.util.TestDataUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.util.List;
/**
 * Tests {@link FmkoreaArticleParser} against a saved board page fixture.
 */
class FmkoreaArticleParserTest {

    /** Fixture HTML loaded through {@link TestDataUtils#fileToString}. */
    private static final String BOARD_FIXTURE = "testdata/fmkorea/file1.html";

    private FmkoreaArticleParser sut;

    @BeforeEach
    void setUp() {
        // A fresh parser per test.
        sut = new FmkoreaArticleParser();
    }

    /** The fixture page is expected to produce exactly 20 articles. */
    @Test
    void parse() {
        // given
        final String fixtureHtml = TestDataUtils.fileToString(BOARD_FIXTURE);

        // when
        final List<Article> articles = sut.parse(fixtureHtml);

        // then
        Assertions.assertEquals(20, articles.size());
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -7,8 +7,8 @@ import lombok.Getter;
@AllArgsConstructor @AllArgsConstructor
public enum CrawlTarget { public enum CrawlTarget {
PPOMPPU_DOMESTIC("뽐뿌국내", true), PPOMPPU_DOMESTIC("뽐뿌국내", true),
PPOMPPU_OVERSEA("뽐뿌해외", false), PPOMPPU_OVERSEA("뽐뿌해외", true),
FMKOREA("펨코", false), FMKOREA("펨코", true),
; ;
private final String alias; private final String alias;