Implement remained CrawlHandler
This commit is contained in:
parent
8502b95a7d
commit
365b15e553
|
@ -1,12 +1,33 @@
|
|||
package com.myoa.engineering.crawl.shopping.crawlhandler;
|
||||
|
||||
import com.myoa.engineering.crawl.shopping.crawlhandler.parser.FmkoreaArticleParser;
|
||||
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
|
||||
import com.myoa.engineering.crawl.shopping.infra.client.fmkorea.FmkoreaBoardClient;
|
||||
import com.myoa.engineering.crawl.shopping.service.ArticleCommandService;
|
||||
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
public class FmkoreaCrawlHandler implements CrawlHandler {
|
||||
|
||||
private final FmkoreaBoardClient fmkoreaBoardClient;
|
||||
private final FmkoreaArticleParser fmkoreaArticleParser;
|
||||
private final ArticleCommandService articleCommandService;
|
||||
|
||||
/**
 * Creates the handler from its three collaborators.
 *
 * @param fmkoreaBoardClient    client used to download the board HTML
 * @param fmkoreaArticleParser  parser that turns board HTML into articles
 * @param articleCommandService service that persists the parsed articles
 */
public FmkoreaCrawlHandler(
        FmkoreaBoardClient fmkoreaBoardClient,
        FmkoreaArticleParser fmkoreaArticleParser,
        ArticleCommandService articleCommandService) {
    this.articleCommandService = articleCommandService;
    this.fmkoreaArticleParser = fmkoreaArticleParser;
    this.fmkoreaBoardClient = fmkoreaBoardClient;
}
|
||||
|
||||
@Override
|
||||
public CrawlTarget getCrawlTarget() {
|
||||
return CrawlTarget.FMKOREA;
|
||||
|
@ -14,5 +35,25 @@ public class FmkoreaCrawlHandler implements CrawlHandler {
|
|||
|
||||
@Override
|
||||
public void handle() {
|
||||
|
||||
String boardHtmlPage1 = fmkoreaBoardClient.getBoardHtml("/index.php", generateRequestParams(1));
|
||||
List<Article> parsedPage1 = fmkoreaArticleParser.parse(boardHtmlPage1);
|
||||
|
||||
String boardHtmlPage2 = fmkoreaBoardClient.getBoardHtml("/index.php", generateRequestParams(2));
|
||||
List<Article> parsedPage2 = fmkoreaArticleParser.parse(boardHtmlPage2);
|
||||
|
||||
List<Article> merged = Stream.of(parsedPage1, parsedPage2)
|
||||
.flatMap(List::stream)
|
||||
.map(e -> e.updateCrawlTarget(getCrawlTarget()))
|
||||
.toList();
|
||||
|
||||
articleCommandService.upsert(merged);
|
||||
}
|
||||
|
||||
private Map<String, String> generateRequestParams(int pageId) {
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put("mid", "hotdeal");
|
||||
params.put("page", String.valueOf(pageId));
|
||||
return params;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
package com.myoa.engineering.crawl.shopping.crawlhandler;
|
||||
|
||||
import com.myoa.engineering.crawl.shopping.crawlhandler.parser.PpomppuArticleParserV2;
|
||||
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
|
||||
import com.myoa.engineering.crawl.shopping.infra.client.ppomppu.PpomppuBoardClientV2;
|
||||
import com.myoa.engineering.crawl.shopping.service.ArticleCommandService;
|
||||
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Component
|
||||
public class PpomppuCrawlOverseaHandler implements CrawlHandler {
|
||||
|
||||
private final PpomppuBoardClientV2 ppomppuBoardClient;
|
||||
private final PpomppuArticleParserV2 ppomppuArticleParserV2;
|
||||
private final ArticleCommandService articleCommandService;
|
||||
|
||||
public PpomppuCrawlOverseaHandler(PpomppuBoardClientV2 ppomppuBoardClient,
|
||||
PpomppuArticleParserV2 ppomppuArticleParserV2,
|
||||
ArticleCommandService articleCommandService) {
|
||||
this.ppomppuBoardClient = ppomppuBoardClient;
|
||||
this.ppomppuArticleParserV2 = ppomppuArticleParserV2;
|
||||
this.articleCommandService = articleCommandService;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CrawlTarget getCrawlTarget() {
|
||||
return CrawlTarget.PPOMPPU_OVERSEA;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handle() {
|
||||
|
||||
String boardHtmlPage1 = ppomppuBoardClient.getBoardHtml("/zboard/zboard.php", generateRequestParams(1));
|
||||
List<Article> parsedPage1 = ppomppuArticleParserV2.parse(boardHtmlPage1);
|
||||
|
||||
String boardHtmlPage2 = ppomppuBoardClient.getBoardHtml("/zboard/zboard.php", generateRequestParams(2));
|
||||
List<Article> parsedPage2 = ppomppuArticleParserV2.parse(boardHtmlPage2);
|
||||
|
||||
List<Article> merged = Stream.of(parsedPage1, parsedPage2)
|
||||
.flatMap(List::stream)
|
||||
.map(e -> e.updateCrawlTarget(getCrawlTarget()))
|
||||
.toList();
|
||||
|
||||
articleCommandService.upsert(merged);
|
||||
}
|
||||
|
||||
private Map<String, String> generateRequestParams(int pageId) {
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put("id", "ppomppu4");
|
||||
params.put("page", String.valueOf(pageId));
|
||||
return params;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
package com.myoa.engineering.crawl.shopping.crawlhandler.parser;
|
||||
|
||||
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
|
||||
import com.myoa.engineering.crawl.shopping.support.dto.constant.CrawlTarget;
|
||||
import com.myoa.engineering.crawl.shopping.util.DateTimeUtils;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.time.ZonedDateTime;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
public class FmkoreaArticleParser {
|
||||
|
||||
private static final String FMKOREA_URL = "https://www.fmkorea.com";
|
||||
|
||||
public List<Article> parse(String html) {
|
||||
Elements liElements = converHtmlToTrElements(html);
|
||||
return liElements.stream()
|
||||
// .filter(this::isRealArticle)
|
||||
.map(this::parse)
|
||||
.toList();
|
||||
}
|
||||
|
||||
private Elements converHtmlToTrElements(String html) {
|
||||
Document document = Jsoup.parse(html);
|
||||
Element liTable = document.getElementsByClass("fm_best_widget").first();
|
||||
return liTable.select("li.li_best2_pop0");
|
||||
}
|
||||
|
||||
private Article parse(Element item) {
|
||||
String link = item.select("h3.title a").attr("href");
|
||||
Long articleId = Long.parseLong(link.replace("/", ""));
|
||||
|
||||
// https://www.fmkorea.com/7023440365
|
||||
String articleUrl = FMKOREA_URL + link;
|
||||
String boardName = item.select("span.category a").text().trim();
|
||||
|
||||
String title = item.select("h3.title a").text();
|
||||
String itemPrice = item.select("div.hotdeal_info span:contains(가격) a").text();
|
||||
String deliveryPrice = item.select("div.hotdeal_info span:contains(배송) a").text();
|
||||
title = title + " 가격: " + itemPrice + " 배송: " + deliveryPrice;
|
||||
String registeredAtString = item.select("span.regdate").text().trim();
|
||||
ZonedDateTime registeredAt = DateTimeUtils.parse(registeredAtString, DateTimeUtils.FORMATTER_HHMM, DateTimeUtils.FORMATTER_YYMMDD_DOT);
|
||||
|
||||
Element recommendationElement = item.selectFirst("a.pc_voted_count");
|
||||
Integer recommended = null;
|
||||
if (recommendationElement != null) {
|
||||
recommended = Integer.parseInt(recommendationElement.selectFirst("span.count").text());
|
||||
}
|
||||
|
||||
return Article.builder()
|
||||
.articleId(articleId)
|
||||
.title(title)
|
||||
.boardName(boardName)
|
||||
.articleUrl(articleUrl)
|
||||
.recommended(recommended)
|
||||
.registeredAt(registeredAt)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -21,14 +21,11 @@ import java.util.regex.Pattern;
|
|||
|
||||
@Slf4j
|
||||
@Component
|
||||
public final class PpomppuArticleParserV2 {
|
||||
public class PpomppuArticleParserV2 {
|
||||
|
||||
private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yy.MM.dd HH:mm:ss")
|
||||
.withZone(ZoneId.of("Asia/Seoul"));
|
||||
|
||||
private PpomppuArticleParserV2() {
|
||||
}
|
||||
|
||||
public List<Article> parse(String html) {
|
||||
Elements trElements = converHtmlToTrElements(html);
|
||||
return trElements.stream()
|
||||
|
@ -38,8 +35,8 @@ public final class PpomppuArticleParserV2 {
|
|||
|
||||
}
|
||||
|
||||
private Elements converHtmlToTrElements(String data) {
|
||||
Document document = Jsoup.parse(data);
|
||||
private Elements converHtmlToTrElements(String html) {
|
||||
Document document = Jsoup.parse(html);
|
||||
Elements trList = document.getElementById("revolution_main_table").getElementsByTag("tr");
|
||||
return trList;
|
||||
}
|
||||
|
@ -72,7 +69,7 @@ public final class PpomppuArticleParserV2 {
|
|||
String boardName = parseBoardName(title);
|
||||
Integer recommended = parseRecommended(tdList.get(4));
|
||||
Integer hit = NumberUtils.parseInt(tdList.get(5).text(), 0);
|
||||
ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text());
|
||||
ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text(), DateTimeUtils.FORMATTER_HHMMss, DateTimeUtils.FORMATTER_YYMMDD_SLASH);
|
||||
|
||||
return Article.builder()
|
||||
.articleId(articleId)
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
package com.myoa.engineering.crawl.shopping.infra.client.fmkorea;
|
||||
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
import org.springframework.cloud.openfeign.SpringQueryMap;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.PathVariable;
|
||||
|
||||
@FeignClient(value = "fmkorea-board-client", url = "https://fmkorea.com")
|
||||
import java.util.Map;
|
||||
|
||||
@FeignClient(value = "fmkorea-board-client", url = "https://www.fmkorea.com")
|
||||
public interface FmkoreaBoardClient {
|
||||
|
||||
@GetMapping("{boardLink}")
|
||||
String getBoardHtml(@PathVariable("boardLink") String boardLink);
|
||||
String getBoardHtml(@PathVariable("boardLink") String boardLink,
|
||||
@SpringQueryMap Map<String, String> params);
|
||||
}
|
||||
|
|
|
@ -1,38 +0,0 @@
|
|||
package com.myoa.engineering.crawl.shopping.infra.client.ppomppu;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
|
||||
* PpomppuBoardFeedRetriever
|
||||
*
|
||||
* @author Shin Woo-jin (woozu.shin@kakaoent.com)
|
||||
* @since 2021-09-08
|
||||
*/
|
||||
@Slf4j
|
||||
@Component
|
||||
public class PpomppuBoardClient {
|
||||
/*
|
||||
private final WebClient webClient;
|
||||
|
||||
public PpomppuBoardClient(WebClient.Builder webClientBuilder) {
|
||||
this.webClient = webClientBuilder.baseUrl(PpomppuBoardName.PPOMPPU_URL)
|
||||
.exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml())
|
||||
.filter(WebClientFilterFactory.logRequest())
|
||||
.filter(WebClientFilterFactory.logResponse())
|
||||
.build();
|
||||
}
|
||||
|
||||
public Mono<String> getHtml(String uri) {
|
||||
return webClient.get()
|
||||
.uri(uri)
|
||||
.exchangeToMono(e -> e.bodyToMono(String.class))
|
||||
.publishOn(Schedulers.boundedElastic())
|
||||
.onErrorResume(WebClientRequestException.class, t -> {
|
||||
log.info("Exception occured, ignoring. : {}", t.getClass().getSimpleName());
|
||||
return Mono.empty();
|
||||
});
|
||||
// .doOnNext(e -> log.info("[getHtml] {}", e));
|
||||
}
|
||||
*/
|
||||
}
|
|
@ -3,6 +3,7 @@ package com.myoa.engineering.crawl.shopping.scheduler;
|
|||
import com.myoa.engineering.crawl.shopping.crawlhandler.CrawlHandler;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.scheduling.annotation.EnableScheduling;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -18,7 +19,7 @@ public class ParseEventEmitter {
|
|||
this.crawlHandlers = crawlHandlers;
|
||||
}
|
||||
|
||||
// @Scheduled(cron = "0 0/5 * * * ?")
|
||||
@Scheduled(cron = "0 0/5 * * * ?")
|
||||
public void emit() {
|
||||
log.info("[emitDomesticBoard] trigger fired!");
|
||||
crawlHandlers.forEach(CrawlHandler::handle);
|
||||
|
|
|
@ -1,27 +1,38 @@
|
|||
package com.myoa.engineering.crawl.shopping.util;
|
||||
|
||||
import org.springframework.lang.Nullable;
|
||||
|
||||
import java.time.*;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
|
||||
/**
 * Date/time parsing helpers for board timestamps, which are interpreted in
 * the {@code Asia/Seoul} time zone.
 */
public final class DateTimeUtils {

    public static final DateTimeFormatter FORMATTER_HHMMss = DateTimeFormatter.ofPattern("HH:mm:ss");
    public static final DateTimeFormatter FORMATTER_YYMMDD_SLASH = DateTimeFormatter.ofPattern("yy/MM/dd");
    // Note: despite the name, this pattern uses a four-digit year.
    public static final DateTimeFormatter FORMATTER_YYMMDD_DOT = DateTimeFormatter.ofPattern("yyyy.MM.dd");
    public static final DateTimeFormatter FORMATTER_HHMM = DateTimeFormatter.ofPattern("HH:mm");
    private static final ZoneId ZONE_ASIA_SEOUL = ZoneId.of("Asia/Seoul");

    private DateTimeUtils() {
    }

    /**
     * Parses a board timestamp using the current time in {@code Asia/Seoul}.
     *
     * @param dateTimeString text to parse; {@code null} yields {@code null}
     * @param formatter      time-of-day formatter tried first
     * @param fallback       date formatter tried second; may be {@code null},
     *                       in which case no date parsing is attempted
     * @return the parsed instant in {@code Asia/Seoul}, or {@code null} when
     *         the text matches neither formatter
     * @see #parse(String, DateTimeFormatter, DateTimeFormatter, Clock)
     */
    public static ZonedDateTime parse(String dateTimeString, DateTimeFormatter formatter,
                                      DateTimeFormatter fallback) {
        return parse(dateTimeString, formatter, fallback, Clock.system(ZONE_ASIA_SEOUL));
    }

    /**
     * Parses a board timestamp relative to the given clock.
     *
     * <p>The text is first read as a time of day on the current Seoul date;
     * if that would lie in the future it is moved back one day. If the text
     * is not a time of day it is read as a date with {@code fallback} and
     * resolved to the start of that day.
     *
     * @param dateTimeString text to parse; {@code null} yields {@code null}
     * @param formatter      time-of-day formatter tried first; skipped when {@code null}
     * @param fallback       date formatter tried second; skipped when {@code null}
     * @param clock          source of the current instant; its zone is ignored
     *                       and replaced by {@code Asia/Seoul}
     * @return the parsed instant in {@code Asia/Seoul}, or {@code null} when
     *         the text matches neither formatter
     */
    public static ZonedDateTime parse(String dateTimeString, DateTimeFormatter formatter,
                                      DateTimeFormatter fallback, Clock clock) {
        if (dateTimeString == null) {
            return null;
        }

        if (formatter != null) {
            try {
                LocalTime time = LocalTime.parse(dateTimeString, formatter);
                // Read "now" once, in Seoul time: the result is stamped with the
                // Seoul zone, so the date must not come from the JVM default zone.
                LocalDateTime now = LocalDateTime.now(clock.withZone(ZONE_ASIA_SEOUL));
                LocalDateTime dateTime = LocalDateTime.of(now.toLocalDate(), time);
                if (dateTime.isAfter(now)) {
                    dateTime = dateTime.minusDays(1);
                }
                return dateTime.atZone(ZONE_ASIA_SEOUL);
            } catch (DateTimeException ignored) {
                // Not a time of day; fall through to the date fallback.
            }
        }

        if (fallback != null) {
            try {
                LocalDate date = LocalDate.parse(dateTimeString, fallback);
                return date.atStartOfDay(ZONE_ASIA_SEOUL);
            } catch (DateTimeException ignored) {
                // Not a date either; report "unparseable" as null below.
            }
        }
        return null;
    }

}
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
package com.myoa.engineering.crawl.shopping.crawlhandler.parser;
|
||||
|
||||
import com.myoa.engineering.crawl.shopping.domain.entity.v2.Article;
|
||||
import com.myoa.engineering.crawl.shopping.util.TestDataUtils;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
class FmkoreaArticleParserTest {
|
||||
|
||||
private FmkoreaArticleParser sut;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
sut = new FmkoreaArticleParser();
|
||||
}
|
||||
|
||||
@Test
|
||||
void parse() {
|
||||
// given
|
||||
String boardHtml = TestDataUtils.fileToString("testdata/fmkorea/file1.html");
|
||||
|
||||
// when
|
||||
List<Article> actual = sut.parse(boardHtml);
|
||||
|
||||
// then
|
||||
Assertions.assertEquals(20, actual.size());
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -7,8 +7,8 @@ import lombok.Getter;
|
|||
@AllArgsConstructor
|
||||
public enum CrawlTarget {
|
||||
PPOMPPU_DOMESTIC("뽐뿌국내", true),
|
||||
PPOMPPU_OVERSEA("뽐뿌해외", false),
|
||||
FMKOREA("펨코", false),
|
||||
PPOMPPU_OVERSEA("뽐뿌해외", true),
|
||||
FMKOREA("펨코", true),
|
||||
;
|
||||
|
||||
private final String alias;
|
||||
|
|
Loading…
Reference in New Issue