From 86fa1cbe09093d14b364dfa2cf92265103c33ab0 Mon Sep 17 00:00:00 2001 From: "woozu.shin" Date: Sun, 26 Sep 2021 22:22:30 +0900 Subject: [PATCH] [PPN-210926-6] Persist feed articles --- .../controller/CrawlAPIController.java | 14 +++-- .../processor/domain/PpomppuArticle.java | 7 ++- .../domain/PpomppuBoardFeedStatus.java | 24 +++++++++ .../processor/domain/SubscribedBoard.java | 3 ++ .../dto/PpomppuArticleTransformer.java | 3 +- .../client/PpomppuBoardFeedRetriever.java | 7 ++- .../PpomppuBoardFeedStatusRepository.java | 4 ++ .../service/PpomppuArticleService.java | 35 ++++++++++-- .../processor/service/PpomppuFeedService.java | 7 ++- .../support/dto/code/PpomppuBoardName.java | 54 +++++++++++-------- 10 files changed, 120 insertions(+), 38 deletions(-) diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/controller/CrawlAPIController.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/controller/CrawlAPIController.java index 99a2ff3..3592278 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/controller/CrawlAPIController.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/controller/CrawlAPIController.java @@ -1,16 +1,17 @@ package com.myoa.engineering.crawl.ppomppu.processor.controller; import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle; -import com.myoa.engineering.crawl.ppomppu.support.dto.APIResponse; import com.myoa.engineering.crawl.ppomppu.processor.dto.FeedParsedResult; +import com.myoa.engineering.crawl.ppomppu.processor.service.PpomppuArticleService; import com.myoa.engineering.crawl.ppomppu.processor.service.PpomppuFeedService; +import com.myoa.engineering.crawl.ppomppu.support.dto.APIResponse; import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; +import java.util.List; import lombok.extern.slf4j.Slf4j; import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; -import reactor.core.publisher.Flux; import reactor.core.publisher.Mono; /** @@ -25,16 +26,21 @@ import reactor.core.publisher.Mono; public class CrawlAPIController { private final PpomppuFeedService ppomppuRSSFeedService; + private final PpomppuArticleService ppomppuArticleService; - public CrawlAPIController(PpomppuFeedService ppomppuRSSFeedService) { + public CrawlAPIController(PpomppuFeedService ppomppuRSSFeedService, + PpomppuArticleService ppomppuArticleService) { this.ppomppuRSSFeedService = ppomppuRSSFeedService; + this.ppomppuArticleService = ppomppuArticleService; } @PostMapping("/boards/{boardName}") public Mono> crawlBoard(@PathVariable("boardName") PpomppuBoardName boardName) { log.info("got request... {}", boardName); FeedParsedResult result = FeedParsedResult.of(boardName); - Flux articles = ppomppuRSSFeedService.getArticles(boardName); + Mono> articles = ppomppuRSSFeedService.getArticles(boardName) + .doOnNext(e -> ppomppuArticleService.filterOnlyNewArticles(boardName, e)) + .doOnNext(e -> ppomppuArticleService.save(boardName, e)); return articles.then(Mono.just(APIResponse.success(result.done()))); } diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/PpomppuArticle.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/PpomppuArticle.java index e42c085..9d88870 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/PpomppuArticle.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/PpomppuArticle.java @@ -4,6 +4,7 @@ import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import java.time.Instant; import javax.persistence.Column; import javax.persistence.Entity; +import javax.persistence.EnumType; import javax.persistence.Enumerated; import javax.persistence.GeneratedValue; import javax.persistence.GenerationType; @@ -27,7 +28,7 @@ public class PpomppuArticle extends Auditable { private Long articleId; @Column - @Enumerated + @Enumerated(EnumType.STRING) private PpomppuBoardName boardName; @Column @@ -58,4 +59,8 @@ public class PpomppuArticle extends Auditable { this.registeredAt = registeredAt; } + public PpomppuArticle updateBoardName(PpomppuBoardName boardName) { + this.boardName = boardName; + return this; + } } diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/PpomppuBoardFeedStatus.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/PpomppuBoardFeedStatus.java index ceda0da..b751c13 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/PpomppuBoardFeedStatus.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/PpomppuBoardFeedStatus.java @@ -4,10 +4,13 @@ import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import java.time.Instant; import javax.persistence.Column; import javax.persistence.Entity; +import javax.persistence.EnumType; +import javax.persistence.Enumerated; import javax.persistence.GeneratedValue; import javax.persistence.GenerationType; import javax.persistence.Id; import javax.persistence.Table; +import lombok.Builder; import lombok.Getter; import lombok.NoArgsConstructor; @@ -25,9 +28,30 @@ public class PpomppuBoardFeedStatus extends Auditable { private Long latestParsedArticleId; @Column + @Enumerated(EnumType.STRING) private PpomppuBoardName boardName; @Column private Instant updatedAt; + public static PpomppuBoardFeedStatus of(PpomppuBoardName boardName, Long latestArticleId) { + return PpomppuBoardFeedStatus.builder() + .boardName(boardName) + .latestParsedArticleId(latestArticleId) + .updatedAt(Instant.now()) + .build(); + } + + public void updateArticleId(Long latestArticleId) { + this.updatedAt = Instant.now(); + this.latestParsedArticleId = latestArticleId; + } + + @Builder + public PpomppuBoardFeedStatus(Long id, Long latestParsedArticleId, PpomppuBoardName boardName, Instant updatedAt) { + this.id = id; + this.latestParsedArticleId = latestParsedArticleId; + this.boardName = boardName; + this.updatedAt = updatedAt; + } } diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/SubscribedBoard.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/SubscribedBoard.java index 5aa7032..2eb7565 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/SubscribedBoard.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/domain/SubscribedBoard.java @@ -3,6 +3,8 @@ package com.myoa.engineering.crawl.ppomppu.processor.domain; import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import javax.persistence.Column; import javax.persistence.Entity; +import javax.persistence.EnumType; +import javax.persistence.Enumerated; import javax.persistence.GeneratedValue; import javax.persistence.GenerationType; import javax.persistence.Id; @@ -24,6 +26,7 @@ public class SubscribedBoard extends Auditable { private Long userId; @Column + @Enumerated(EnumType.STRING) private PpomppuBoardName boardName; } diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/dto/PpomppuArticleTransformer.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/dto/PpomppuArticleTransformer.java index ea00ee8..18bd30f 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/dto/PpomppuArticleTransformer.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/dto/PpomppuArticleTransformer.java @@ -1,6 +1,7 @@ package com.myoa.engineering.crawl.ppomppu.processor.dto; import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle; +import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; @@ -47,7 +48,7 @@ public final class PpomppuArticleTransformer { } public static String toArticleUrl(Element td) { - return td.getElementsByTag("a").attr("href"); + return PpomppuBoardName.ofViewPageUrl(td.getElementsByTag("a").attr("href")); } public static Integer toRecommended(Element td) { diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/infrastructure/client/PpomppuBoardFeedRetriever.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/infrastructure/client/PpomppuBoardFeedRetriever.java index 075adf8..a635288 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/infrastructure/client/PpomppuBoardFeedRetriever.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/infrastructure/client/PpomppuBoardFeedRetriever.java @@ -2,6 +2,7 @@ package com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client; import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebClientFilterFactory; import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebFluxExchangeStragiesFactory; +import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; import org.springframework.web.reactive.function.client.WebClient; @@ -19,12 +20,10 @@ import reactor.core.scheduler.Schedulers; @Component public class PpomppuBoardFeedRetriever { - private static final String PPOMPPU_URL = "https://www.ppomppu.co.kr/"; - private final WebClient webClient; public PpomppuBoardFeedRetriever(WebClient.Builder webClientBuilder) { - this.webClient = webClientBuilder.baseUrl(PPOMPPU_URL) + this.webClient = webClientBuilder.baseUrl(PpomppuBoardName.PPOMPPU_URL) .exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml()) .filter(WebClientFilterFactory.logRequest()) .filter(WebClientFilterFactory.logResponse()) @@ -33,7 +32,7 @@ public class PpomppuBoardFeedRetriever { public Mono getHtml(String uri) { return webClient.get() - .uri("/zboard/zboard.php?id=ppomppu") + .uri(uri) .exchangeToMono(e -> e.bodyToMono(String.class)) .publishOn(Schedulers.boundedElastic()) .onErrorResume(WebClientRequestException.class, t -> { diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/infrastructure/repository/PpomppuBoardFeedStatusRepository.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/infrastructure/repository/PpomppuBoardFeedStatusRepository.java index cf1d84c..e84a077 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/infrastructure/repository/PpomppuBoardFeedStatusRepository.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/infrastructure/repository/PpomppuBoardFeedStatusRepository.java @@ -1,10 +1,14 @@ package com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository; import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuBoardFeedStatus; +import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; +import java.util.Optional; import org.springframework.data.jpa.repository.JpaRepository; import org.springframework.stereotype.Repository; @Repository public interface PpomppuBoardFeedStatusRepository extends JpaRepository { + Optional findByBoardName(PpomppuBoardName boardName); + } diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/service/PpomppuArticleService.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/service/PpomppuArticleService.java index 4b68630..632e3c3 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/service/PpomppuArticleService.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/service/PpomppuArticleService.java @@ -1,11 +1,16 @@ package com.myoa.engineering.crawl.ppomppu.processor.service; import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle; +import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuBoardFeedStatus; import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository.PpomppuArticleRepository; import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository.PpomppuBoardFeedStatusRepository; +import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; @Slf4j @Service @@ -21,10 +26,34 @@ public class PpomppuArticleService { this.ppomppuBoardFeedStatusRepository = ppomppuBoardFeedStatusRepository; } - public void save(List articles) { - // TODO get latest parsed article id - // TODO filter articles + @Transactional(readOnly = true) + public List filterOnlyNewArticles(PpomppuBoardName boardName, List articles) { + Optional boardFeedStatus = ppomppuBoardFeedStatusRepository.findByBoardName(boardName); + Long latestArticleId = boardFeedStatus.map(PpomppuBoardFeedStatus::getLatestParsedArticleId) + .orElse(0L); + return articles.stream() + .filter(e -> e.getArticleId().compareTo(latestArticleId) > 0) + .collect(Collectors.toList()); + } + + @Transactional + public void save(PpomppuBoardName boardName, List articles) { + Long latestArticleId = articles.stream() + .map(PpomppuArticle::getArticleId) + .max(Long::compareTo) + .orElse(0L); + + // save PpomppuBoardFeedStatus + Optional boardFeedStatus = ppomppuBoardFeedStatusRepository.findByBoardName(boardName); + boardFeedStatus.ifPresentOrElse(e -> { + e.updateArticleId(latestArticleId); + ppomppuBoardFeedStatusRepository.save(e); + }, + () -> ppomppuBoardFeedStatusRepository.save(PpomppuBoardFeedStatus.of(boardName, + latestArticleId))); + + // save real articles. ppomppuArticleRepository.saveAll(articles); } } diff --git a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/service/PpomppuFeedService.java b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/service/PpomppuFeedService.java index 0f64392..461bd2f 100644 --- a/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/service/PpomppuFeedService.java +++ b/processor/src/main/java/com/myoa/engineering/crawl/ppomppu/processor/service/PpomppuFeedService.java @@ -4,6 +4,7 @@ import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle; import com.myoa.engineering.crawl.ppomppu.processor.dto.PpomppuArticleTransformer; import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client.PpomppuBoardFeedRetriever; import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; +import java.util.List; import lombok.extern.slf4j.Slf4j; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; @@ -27,12 +28,14 @@ public class PpomppuFeedService { this.ppomppuBoardFeedRetriever = ppomppuBoardFeedRetriever; } - public Flux getArticles(PpomppuBoardName boardName) { + public Mono> getArticles(PpomppuBoardName boardName) { final Mono html = ppomppuBoardFeedRetriever.getHtml(boardName.getResourcePath()); final Mono tbody = extractTbodyFromHtml(html) .doOnNext(e -> log.info("pre tbody - {}", e.html())); return extractArticlesFromTbody(tbody).map(this::convertFromElement) - .doOnNext(e -> log.info("parsed Result: {}", e)); + .map(e -> e.updateBoardName(boardName)) + .doOnNext(e -> log.info("parsed Result: {}", e)) + .collectList(); } private Mono extractTbodyFromHtml(Mono html) { diff --git a/support/src/main/java/com/myoa/engineering/crawl/ppomppu/support/dto/code/PpomppuBoardName.java b/support/src/main/java/com/myoa/engineering/crawl/ppomppu/support/dto/code/PpomppuBoardName.java index e254e84..eca19fe 100644 --- a/support/src/main/java/com/myoa/engineering/crawl/ppomppu/support/dto/code/PpomppuBoardName.java +++ b/support/src/main/java/com/myoa/engineering/crawl/ppomppu/support/dto/code/PpomppuBoardName.java @@ -12,34 +12,42 @@ import lombok.NoArgsConstructor; @Getter @NoArgsConstructor public enum PpomppuBoardName { - PPOMPPU_DOMESTIC_ETC("zboard/zboard.php?id=ppomppu&category=1"), - PPOMPPU_DOMESTIC_COMPUTER("zboard/zboard.php?id=ppomppu&category=4"), - PPOMPPU_DOMESTIC_DIGITAL("zboard/zboard.php?id=ppomppu&category=5"), - PPOMPPU_DOMESTIC_FOOD("zboard/zboard.php?id=ppomppu&category=6"), - PPOMPPU_DOMESTIC_BOOK("zboard/zboard.php?id=ppomppu&category=8"), - PPOMPPU_DOMESTIC_APPLIANCES("zboard/zboard.php?id=ppomppu&category=9"), - PPOMPPU_DOMESTIC_PARENTING("zboard/zboard.php?id=ppomppu&category=10"), - PPOMPPU_DOMESTIC_GIFTCARD("zboard/zboard.php?id=ppomppu&category=11"), - PPOMPPU_DOMESTIC_CLOTHES("zboard/zboard.php?id=ppomppu&category=12"), - PPOMPPU_DOMESTIC_COSMETIC("zboard/zboard.php?id=ppomppu&category=13"), - PPOMPPU_DOMESTIC_OUTDOOR("zboard/zboard.php?id=ppomppu&category=15"), - PPOMPPU_OVERSEA_ETC("zboard/zboard.php?id=ppomppu4&category=1"), - PPOMPPU_OVERSEA_APPLIANCES("zboard/zboard.php?id=ppomppu4&category=7"), - PPOMPPU_OVERSEA_TVAV("zboard/zboard.php?id=ppomppu4&category=8"), - PPOMPPU_OVERSEA_COMPUTER("zboard/zboard.php?id=ppomppu4&category=3"), - PPOMPPU_OVERSEA_DIGITAL("zboard/zboard.php?id=ppomppu4&category=4"), - PPOMPPU_OVERSEA_MOBILEACCESSORY("zboard/zboard.php?id=ppomppu4&category=9"), - PPOMPPU_OVERSEA_CLOTHES("zboard/zboard.php?id=ppomppu4&category=5"), - PPOMPPU_OVERSEA_WATCH("zboard/zboard.php?id=ppomppu4&category=2"), - PPOMPPU_OVERSEA_SHOES("zboard/zboard.php?id=ppomppu4&category=11"), - PPOMPPU_OVERSEA_FOOD("zboard/zboard.php?id=ppomppu4&category=10"), - PPOMPPU_OVERSEA_PARENTING("zboard/zboard.php?id=ppomppu4&category=6"), + PPOMPPU_DOMESTIC_ETC("zboard/zboard.php?id=ppomppu&category=1", true), + PPOMPPU_DOMESTIC_COMPUTER("zboard/zboard.php?id=ppomppu&category=4", true), + PPOMPPU_DOMESTIC_DIGITAL("zboard/zboard.php?id=ppomppu&category=5", true), + PPOMPPU_DOMESTIC_FOOD("zboard/zboard.php?id=ppomppu&category=6", true), + PPOMPPU_DOMESTIC_BOOK("zboard/zboard.php?id=ppomppu&category=8", true), + PPOMPPU_DOMESTIC_APPLIANCES("zboard/zboard.php?id=ppomppu&category=9", true), + PPOMPPU_DOMESTIC_PARENTING("zboard/zboard.php?id=ppomppu&category=10", true), + PPOMPPU_DOMESTIC_GIFTCARD("zboard/zboard.php?id=ppomppu&category=11", true), + PPOMPPU_DOMESTIC_CLOTHES("zboard/zboard.php?id=ppomppu&category=12", true), + PPOMPPU_DOMESTIC_COSMETIC("zboard/zboard.php?id=ppomppu&category=13", true), + PPOMPPU_DOMESTIC_OUTDOOR("zboard/zboard.php?id=ppomppu&category=15", true), + PPOMPPU_OVERSEA_ETC("zboard/zboard.php?id=ppomppu4&category=1", true), + PPOMPPU_OVERSEA_APPLIANCES("zboard/zboard.php?id=ppomppu4&category=7", true), + PPOMPPU_OVERSEA_TVAV("zboard/zboard.php?id=ppomppu4&category=8", true), + PPOMPPU_OVERSEA_COMPUTER("zboard/zboard.php?id=ppomppu4&category=3", true), + PPOMPPU_OVERSEA_DIGITAL("zboard/zboard.php?id=ppomppu4&category=4", true), + PPOMPPU_OVERSEA_MOBILEACCESSORY("zboard/zboard.php?id=ppomppu4&category=9", true), + PPOMPPU_OVERSEA_CLOTHES("zboard/zboard.php?id=ppomppu4&category=5", true), + PPOMPPU_OVERSEA_WATCH("zboard/zboard.php?id=ppomppu4&category=2", true), + PPOMPPU_OVERSEA_SHOES("zboard/zboard.php?id=ppomppu4&category=11", true), + PPOMPPU_OVERSEA_FOOD("zboard/zboard.php?id=ppomppu4&category=10", true), + PPOMPPU_OVERSEA_PARENTING("zboard/zboard.php?id=ppomppu4&category=6", true), ; private String resourcePath; + private boolean crawlWithDefaultTimer; - PpomppuBoardName(String boardPath) { + PpomppuBoardName(String boardPath, boolean crawlWithDefaultTimer) { this.resourcePath = boardPath; + this.crawlWithDefaultTimer = crawlWithDefaultTimer; + } + + public static final String PPOMPPU_URL = "https://www.ppomppu.co.kr/"; + + public static String ofViewPageUrl(String articleUrl) { + return PPOMPPU_URL + "zboard/" + articleUrl; } }