[PPN-210926-6] Persist feed articles #7

Merged
nthfuncx merged 1 commit from feature/PPN-210926-6 into feature/PPN-210926-5 2021-09-26 13:24:33 +00:00
10 changed files with 120 additions and 38 deletions

View File

@ -1,16 +1,17 @@
package com.myoa.engineering.crawl.ppomppu.processor.controller; package com.myoa.engineering.crawl.ppomppu.processor.controller;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle; import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.support.dto.APIResponse;
import com.myoa.engineering.crawl.ppomppu.processor.dto.FeedParsedResult; import com.myoa.engineering.crawl.ppomppu.processor.dto.FeedParsedResult;
import com.myoa.engineering.crawl.ppomppu.processor.service.PpomppuArticleService;
import com.myoa.engineering.crawl.ppomppu.processor.service.PpomppuFeedService; import com.myoa.engineering.crawl.ppomppu.processor.service.PpomppuFeedService;
import com.myoa.engineering.crawl.ppomppu.support.dto.APIResponse;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.List;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController; import org.springframework.web.bind.annotation.RestController;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono; import reactor.core.publisher.Mono;
/** /**
@ -25,16 +26,21 @@ import reactor.core.publisher.Mono;
public class CrawlAPIController { public class CrawlAPIController {
private final PpomppuFeedService ppomppuRSSFeedService; private final PpomppuFeedService ppomppuRSSFeedService;
private final PpomppuArticleService ppomppuArticleService;
/**
 * Creates the controller with the two services it delegates to.
 *
 * @param ppomppuRSSFeedService service that fetches and parses the articles of a board
 * @param ppomppuArticleService service that filters and persists parsed articles
 */
public CrawlAPIController(PpomppuFeedService ppomppuRSSFeedService,
                          PpomppuArticleService ppomppuArticleService) {
    this.ppomppuArticleService = ppomppuArticleService;
    this.ppomppuRSSFeedService = ppomppuRSSFeedService;
}
@PostMapping("/boards/{boardName}") @PostMapping("/boards/{boardName}")
public Mono<APIResponse<FeedParsedResult>> crawlBoard(@PathVariable("boardName") PpomppuBoardName boardName) { public Mono<APIResponse<FeedParsedResult>> crawlBoard(@PathVariable("boardName") PpomppuBoardName boardName) {
log.info("got request... {}", boardName); log.info("got request... {}", boardName);
FeedParsedResult result = FeedParsedResult.of(boardName); FeedParsedResult result = FeedParsedResult.of(boardName);
Flux<PpomppuArticle> articles = ppomppuRSSFeedService.getArticles(boardName); Mono<List<PpomppuArticle>> articles = ppomppuRSSFeedService.getArticles(boardName)
.doOnNext(e -> ppomppuArticleService.filterOnlyNewArticles(boardName, e))
.doOnNext(e -> ppomppuArticleService.save(boardName, e));
return articles.then(Mono.just(APIResponse.success(result.done()))); return articles.then(Mono.just(APIResponse.success(result.done())));
} }

View File

@ -4,6 +4,7 @@ import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.time.Instant; import java.time.Instant;
import javax.persistence.Column; import javax.persistence.Column;
import javax.persistence.Entity; import javax.persistence.Entity;
import javax.persistence.EnumType;
import javax.persistence.Enumerated; import javax.persistence.Enumerated;
import javax.persistence.GeneratedValue; import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType; import javax.persistence.GenerationType;
@ -27,7 +28,7 @@ public class PpomppuArticle extends Auditable {
private Long articleId; private Long articleId;
@Column @Column
@Enumerated @Enumerated(EnumType.STRING)
private PpomppuBoardName boardName; private PpomppuBoardName boardName;
@Column @Column
@ -58,4 +59,8 @@ public class PpomppuArticle extends Auditable {
this.registeredAt = registeredAt; this.registeredAt = registeredAt;
} }
/**
 * Sets the board this article is attributed to.
 *
 * @param boardName board to store on the article
 * @return this same instance, so the call can be used inside a mapping chain
 */
public PpomppuArticle updateBoardName(PpomppuBoardName boardName) {
this.boardName = boardName;
return this;
}
} }

View File

@ -4,10 +4,13 @@ import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.time.Instant; import java.time.Instant;
import javax.persistence.Column; import javax.persistence.Column;
import javax.persistence.Entity; import javax.persistence.Entity;
import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import javax.persistence.GeneratedValue; import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType; import javax.persistence.GenerationType;
import javax.persistence.Id; import javax.persistence.Id;
import javax.persistence.Table; import javax.persistence.Table;
import lombok.Builder;
import lombok.Getter; import lombok.Getter;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
@ -25,9 +28,30 @@ public class PpomppuBoardFeedStatus extends Auditable {
private Long latestParsedArticleId; private Long latestParsedArticleId;
@Column @Column
@Enumerated(EnumType.STRING)
private PpomppuBoardName boardName; private PpomppuBoardName boardName;
@Column @Column
private Instant updatedAt; private Instant updatedAt;
/**
 * Builds a feed status for a board, stamped with the current instant.
 *
 * @param boardName board the status belongs to
 * @param latestArticleId id stored as the latest parsed article
 * @return a new status whose {@code id} is left unset
 */
public static PpomppuBoardFeedStatus of(PpomppuBoardName boardName, Long latestArticleId) {
    final Instant now = Instant.now();
    return PpomppuBoardFeedStatus.builder()
            .updatedAt(now)
            .latestParsedArticleId(latestArticleId)
            .boardName(boardName)
            .build();
}
/**
 * Stores a new latest parsed article id and refreshes {@code updatedAt} to the current instant.
 *
 * @param latestArticleId id to record as the latest parsed article
 */
public void updateArticleId(Long latestArticleId) {
    this.latestParsedArticleId = latestArticleId;
    this.updatedAt = Instant.now();
}
/**
 * Assigns the four given values to the entity; annotated with Lombok's {@code @Builder},
 * so the builder used by {@code of(...)} is derived from this constructor.
 *
 * @param id value for the {@code id} field
 * @param latestParsedArticleId id of the latest parsed article
 * @param boardName board the status belongs to
 * @param updatedAt instant of the last update
 */
@Builder
public PpomppuBoardFeedStatus(Long id, Long latestParsedArticleId, PpomppuBoardName boardName, Instant updatedAt) {
    this.updatedAt = updatedAt;
    this.boardName = boardName;
    this.latestParsedArticleId = latestParsedArticleId;
    this.id = id;
}
} }

View File

@ -3,6 +3,8 @@ package com.myoa.engineering.crawl.ppomppu.processor.domain;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import javax.persistence.Column; import javax.persistence.Column;
import javax.persistence.Entity; import javax.persistence.Entity;
import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import javax.persistence.GeneratedValue; import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType; import javax.persistence.GenerationType;
import javax.persistence.Id; import javax.persistence.Id;
@ -24,6 +26,7 @@ public class SubscribedBoard extends Auditable {
private Long userId; private Long userId;
@Column @Column
@Enumerated(EnumType.STRING)
private PpomppuBoardName boardName; private PpomppuBoardName boardName;
} }

View File

@ -1,6 +1,7 @@
package com.myoa.engineering.crawl.ppomppu.processor.dto; package com.myoa.engineering.crawl.ppomppu.processor.dto;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle; import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.time.Instant; import java.time.Instant;
import java.time.ZoneId; import java.time.ZoneId;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
@ -47,7 +48,7 @@ public final class PpomppuArticleTransformer {
} }
public static String toArticleUrl(Element td) { public static String toArticleUrl(Element td) {
return td.getElementsByTag("a").attr("href"); return PpomppuBoardName.ofViewPageUrl(td.getElementsByTag("a").attr("href"));
} }
public static Integer toRecommended(Element td) { public static Integer toRecommended(Element td) {

View File

@ -2,6 +2,7 @@ package com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client;
import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebClientFilterFactory; import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebClientFilterFactory;
import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebFluxExchangeStragiesFactory; import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebFluxExchangeStragiesFactory;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import org.springframework.web.reactive.function.client.WebClient; import org.springframework.web.reactive.function.client.WebClient;
@ -19,12 +20,10 @@ import reactor.core.scheduler.Schedulers;
@Component @Component
public class PpomppuBoardFeedRetriever { public class PpomppuBoardFeedRetriever {
private static final String PPOMPPU_URL = "https://www.ppomppu.co.kr/";
private final WebClient webClient; private final WebClient webClient;
public PpomppuBoardFeedRetriever(WebClient.Builder webClientBuilder) { public PpomppuBoardFeedRetriever(WebClient.Builder webClientBuilder) {
this.webClient = webClientBuilder.baseUrl(PPOMPPU_URL) this.webClient = webClientBuilder.baseUrl(PpomppuBoardName.PPOMPPU_URL)
.exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml()) .exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml())
.filter(WebClientFilterFactory.logRequest()) .filter(WebClientFilterFactory.logRequest())
.filter(WebClientFilterFactory.logResponse()) .filter(WebClientFilterFactory.logResponse())
@ -33,7 +32,7 @@ public class PpomppuBoardFeedRetriever {
public Mono<String> getHtml(String uri) { public Mono<String> getHtml(String uri) {
return webClient.get() return webClient.get()
.uri("/zboard/zboard.php?id=ppomppu") .uri(uri)
.exchangeToMono(e -> e.bodyToMono(String.class)) .exchangeToMono(e -> e.bodyToMono(String.class))
.publishOn(Schedulers.boundedElastic()) .publishOn(Schedulers.boundedElastic())
.onErrorResume(WebClientRequestException.class, t -> { .onErrorResume(WebClientRequestException.class, t -> {

View File

@ -1,10 +1,14 @@
package com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository; package com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuBoardFeedStatus; import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuBoardFeedStatus;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.Optional;
import org.springframework.data.jpa.repository.JpaRepository; import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository; import org.springframework.stereotype.Repository;
/**
 * Repository for {@link PpomppuBoardFeedStatus} entities, keyed by a {@code Long} id.
 */
@Repository
public interface PpomppuBoardFeedStatusRepository extends JpaRepository<PpomppuBoardFeedStatus, Long> {

    /**
     * Looks up the feed status stored for a board.
     *
     * @param boardName board to look up
     * @return the matching status, or empty when none is stored
     */
    Optional<PpomppuBoardFeedStatus> findByBoardName(PpomppuBoardName boardName);
}

View File

@ -1,11 +1,16 @@
package com.myoa.engineering.crawl.ppomppu.processor.service; package com.myoa.engineering.crawl.ppomppu.processor.service;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle; import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuBoardFeedStatus;
import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository.PpomppuArticleRepository; import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository.PpomppuArticleRepository;
import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository.PpomppuBoardFeedStatusRepository; import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository.PpomppuBoardFeedStatusRepository;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
@Slf4j @Slf4j
@Service @Service
@ -21,10 +26,34 @@ public class PpomppuArticleService {
this.ppomppuBoardFeedStatusRepository = ppomppuBoardFeedStatusRepository; this.ppomppuBoardFeedStatusRepository = ppomppuBoardFeedStatusRepository;
} }
/**
 * Returns the articles whose id is greater than the latest parsed article id stored for the
 * board. When no status (or no id) is stored for the board, 0 is used as the threshold.
 *
 * @param boardName board whose stored feed status supplies the threshold
 * @param articles parsed articles to filter
 * @return a new list holding only the articles above the threshold
 */
@Transactional(readOnly = true)
public List<PpomppuArticle> filterOnlyNewArticles(PpomppuBoardName boardName, List<PpomppuArticle> articles) {
    final Long threshold = ppomppuBoardFeedStatusRepository.findByBoardName(boardName)
            .map(PpomppuBoardFeedStatus::getLatestParsedArticleId)
            .orElse(0L);

    return articles.stream()
            .filter(article -> threshold.compareTo(article.getArticleId()) < 0)
            .collect(Collectors.toList());
}
@Transactional
public void save(PpomppuBoardName boardName, List<PpomppuArticle> articles) {
Long latestArticleId = articles.stream()
.map(PpomppuArticle::getArticleId)
.max(Long::compareTo)
.orElse(0L);
// save PpomppuBoardFeedStatus
Optional<PpomppuBoardFeedStatus> boardFeedStatus = ppomppuBoardFeedStatusRepository.findByBoardName(boardName);
boardFeedStatus.ifPresentOrElse(e -> {
e.updateArticleId(latestArticleId);
ppomppuBoardFeedStatusRepository.save(e);
},
() -> ppomppuBoardFeedStatusRepository.save(PpomppuBoardFeedStatus.of(boardName,
latestArticleId)));
// save real articles.
ppomppuArticleRepository.saveAll(articles); ppomppuArticleRepository.saveAll(articles);
} }
} }

View File

@ -4,6 +4,7 @@ import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.processor.dto.PpomppuArticleTransformer; import com.myoa.engineering.crawl.ppomppu.processor.dto.PpomppuArticleTransformer;
import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client.PpomppuBoardFeedRetriever; import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client.PpomppuBoardFeedRetriever;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName; import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.List;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
@ -27,12 +28,14 @@ public class PpomppuFeedService {
this.ppomppuBoardFeedRetriever = ppomppuBoardFeedRetriever; this.ppomppuBoardFeedRetriever = ppomppuBoardFeedRetriever;
} }
public Flux<PpomppuArticle> getArticles(PpomppuBoardName boardName) { public Mono<List<PpomppuArticle>> getArticles(PpomppuBoardName boardName) {
final Mono<String> html = ppomppuBoardFeedRetriever.getHtml(boardName.getResourcePath()); final Mono<String> html = ppomppuBoardFeedRetriever.getHtml(boardName.getResourcePath());
final Mono<Element> tbody = extractTbodyFromHtml(html) final Mono<Element> tbody = extractTbodyFromHtml(html)
.doOnNext(e -> log.info("pre tbody - {}", e.html())); .doOnNext(e -> log.info("pre tbody - {}", e.html()));
return extractArticlesFromTbody(tbody).map(this::convertFromElement) return extractArticlesFromTbody(tbody).map(this::convertFromElement)
.doOnNext(e -> log.info("parsed Result: {}", e)); .map(e -> e.updateBoardName(boardName))
.doOnNext(e -> log.info("parsed Result: {}", e))
.collectList();
} }
private Mono<Element> extractTbodyFromHtml(Mono<String> html) { private Mono<Element> extractTbodyFromHtml(Mono<String> html) {

View File

@ -12,34 +12,42 @@ import lombok.NoArgsConstructor;
@Getter @Getter
@NoArgsConstructor @NoArgsConstructor
public enum PpomppuBoardName { public enum PpomppuBoardName {
PPOMPPU_DOMESTIC_ETC("zboard/zboard.php?id=ppomppu&category=1"), PPOMPPU_DOMESTIC_ETC("zboard/zboard.php?id=ppomppu&category=1", true),
PPOMPPU_DOMESTIC_COMPUTER("zboard/zboard.php?id=ppomppu&category=4"), PPOMPPU_DOMESTIC_COMPUTER("zboard/zboard.php?id=ppomppu&category=4", true),
PPOMPPU_DOMESTIC_DIGITAL("zboard/zboard.php?id=ppomppu&category=5"), PPOMPPU_DOMESTIC_DIGITAL("zboard/zboard.php?id=ppomppu&category=5", true),
PPOMPPU_DOMESTIC_FOOD("zboard/zboard.php?id=ppomppu&category=6"), PPOMPPU_DOMESTIC_FOOD("zboard/zboard.php?id=ppomppu&category=6", true),
PPOMPPU_DOMESTIC_BOOK("zboard/zboard.php?id=ppomppu&category=8"), PPOMPPU_DOMESTIC_BOOK("zboard/zboard.php?id=ppomppu&category=8", true),
PPOMPPU_DOMESTIC_APPLIANCES("zboard/zboard.php?id=ppomppu&category=9"), PPOMPPU_DOMESTIC_APPLIANCES("zboard/zboard.php?id=ppomppu&category=9", true),
PPOMPPU_DOMESTIC_PARENTING("zboard/zboard.php?id=ppomppu&category=10"), PPOMPPU_DOMESTIC_PARENTING("zboard/zboard.php?id=ppomppu&category=10", true),
PPOMPPU_DOMESTIC_GIFTCARD("zboard/zboard.php?id=ppomppu&category=11"), PPOMPPU_DOMESTIC_GIFTCARD("zboard/zboard.php?id=ppomppu&category=11", true),
PPOMPPU_DOMESTIC_CLOTHES("zboard/zboard.php?id=ppomppu&category=12"), PPOMPPU_DOMESTIC_CLOTHES("zboard/zboard.php?id=ppomppu&category=12", true),
PPOMPPU_DOMESTIC_COSMETIC("zboard/zboard.php?id=ppomppu&category=13"), PPOMPPU_DOMESTIC_COSMETIC("zboard/zboard.php?id=ppomppu&category=13", true),
PPOMPPU_DOMESTIC_OUTDOOR("zboard/zboard.php?id=ppomppu&category=15"), PPOMPPU_DOMESTIC_OUTDOOR("zboard/zboard.php?id=ppomppu&category=15", true),
PPOMPPU_OVERSEA_ETC("zboard/zboard.php?id=ppomppu4&category=1"), PPOMPPU_OVERSEA_ETC("zboard/zboard.php?id=ppomppu4&category=1", true),
PPOMPPU_OVERSEA_APPLIANCES("zboard/zboard.php?id=ppomppu4&category=7"), PPOMPPU_OVERSEA_APPLIANCES("zboard/zboard.php?id=ppomppu4&category=7", true),
PPOMPPU_OVERSEA_TVAV("zboard/zboard.php?id=ppomppu4&category=8"), PPOMPPU_OVERSEA_TVAV("zboard/zboard.php?id=ppomppu4&category=8", true),
PPOMPPU_OVERSEA_COMPUTER("zboard/zboard.php?id=ppomppu4&category=3"), PPOMPPU_OVERSEA_COMPUTER("zboard/zboard.php?id=ppomppu4&category=3", true),
PPOMPPU_OVERSEA_DIGITAL("zboard/zboard.php?id=ppomppu4&category=4"), PPOMPPU_OVERSEA_DIGITAL("zboard/zboard.php?id=ppomppu4&category=4", true),
PPOMPPU_OVERSEA_MOBILEACCESSORY("zboard/zboard.php?id=ppomppu4&category=9"), PPOMPPU_OVERSEA_MOBILEACCESSORY("zboard/zboard.php?id=ppomppu4&category=9", true),
PPOMPPU_OVERSEA_CLOTHES("zboard/zboard.php?id=ppomppu4&category=5"), PPOMPPU_OVERSEA_CLOTHES("zboard/zboard.php?id=ppomppu4&category=5", true),
PPOMPPU_OVERSEA_WATCH("zboard/zboard.php?id=ppomppu4&category=2"), PPOMPPU_OVERSEA_WATCH("zboard/zboard.php?id=ppomppu4&category=2", true),
PPOMPPU_OVERSEA_SHOES("zboard/zboard.php?id=ppomppu4&category=11"), PPOMPPU_OVERSEA_SHOES("zboard/zboard.php?id=ppomppu4&category=11", true),
PPOMPPU_OVERSEA_FOOD("zboard/zboard.php?id=ppomppu4&category=10"), PPOMPPU_OVERSEA_FOOD("zboard/zboard.php?id=ppomppu4&category=10", true),
PPOMPPU_OVERSEA_PARENTING("zboard/zboard.php?id=ppomppu4&category=6"), PPOMPPU_OVERSEA_PARENTING("zboard/zboard.php?id=ppomppu4&category=6", true),
; ;
private String resourcePath; private String resourcePath;
private boolean crawlWithDefaultTimer;
PpomppuBoardName(String boardPath) { PpomppuBoardName(String boardPath, boolean crawlWithDefaultTimer) {
this.resourcePath = boardPath; this.resourcePath = boardPath;
this.crawlWithDefaultTimer = crawlWithDefaultTimer;
}
public static final String PPOMPPU_URL = "https://www.ppomppu.co.kr/";
public static String ofViewPageUrl(String articleUrl) {
return PPOMPPU_URL + "zboard/" + articleUrl;
} }
} }