[PPN-210926-6] Persist feed articles

This commit is contained in:
woozu.shin 2021-09-26 22:22:30 +09:00
parent ab4ab339f6
commit 86fa1cbe09
10 changed files with 120 additions and 38 deletions

View File

@ -1,16 +1,17 @@
package com.myoa.engineering.crawl.ppomppu.processor.controller;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.support.dto.APIResponse;
import com.myoa.engineering.crawl.ppomppu.processor.dto.FeedParsedResult;
import com.myoa.engineering.crawl.ppomppu.processor.service.PpomppuArticleService;
import com.myoa.engineering.crawl.ppomppu.processor.service.PpomppuFeedService;
import com.myoa.engineering.crawl.ppomppu.support.dto.APIResponse;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
/**
@ -25,16 +26,21 @@ import reactor.core.publisher.Mono;
public class CrawlAPIController {
private final PpomppuFeedService ppomppuRSSFeedService;
private final PpomppuArticleService ppomppuArticleService;
/**
 * Creates the controller with the services it delegates to.
 *
 * @param ppomppuRSSFeedService service used to fetch and parse the articles of a board
 * @param ppomppuArticleService service used to filter and persist parsed articles
 */
public CrawlAPIController(PpomppuFeedService ppomppuRSSFeedService,
                          PpomppuArticleService ppomppuArticleService) {
    this.ppomppuRSSFeedService = ppomppuRSSFeedService;
    this.ppomppuArticleService = ppomppuArticleService;
}
/**
 * Crawls the given board, keeps only the articles that are newer than the last recorded
 * article id for that board, and persists them.
 *
 * @param boardName board to crawl, bound from the {@code boardName} path variable
 * @return a {@link Mono} emitting a success response after the crawl pipeline completes
 */
@PostMapping("/boards/{boardName}")
public Mono<APIResponse<FeedParsedResult>> crawlBoard(@PathVariable("boardName") PpomppuBoardName boardName) {
    log.info("got request... {}", boardName);
    FeedParsedResult result = FeedParsedResult.of(boardName);
    Mono<List<PpomppuArticle>> articles = ppomppuRSSFeedService.getArticles(boardName)
        // map, not doOnNext: doOnNext ignores the callback's return value, so the filtered
        // list would be thrown away and save() would receive every parsed article again.
        .map(parsed -> ppomppuArticleService.filterOnlyNewArticles(boardName, parsed))
        .doOnNext(fresh -> ppomppuArticleService.save(boardName, fresh));
    // NOTE(review): result.done() is evaluated here at assembly time, before the pipeline
    // above has run; confirm that is acceptable for whatever done() records.
    return articles.then(Mono.just(APIResponse.success(result.done())));
}

View File

@ -4,6 +4,7 @@ import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.time.Instant;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
@ -27,7 +28,7 @@ public class PpomppuArticle extends Auditable {
private Long articleId;
@Column
@Enumerated
@Enumerated(EnumType.STRING)
private PpomppuBoardName boardName;
@Column
@ -58,4 +59,8 @@ public class PpomppuArticle extends Auditable {
this.registeredAt = registeredAt;
}
/**
 * Sets the board this article belongs to.
 *
 * @param boardName board to associate with this article
 * @return this same instance, so the call can be used inside a mapping chain
 */
public PpomppuArticle updateBoardName(PpomppuBoardName boardName) {
this.boardName = boardName;
return this;
}
}

View File

@ -4,10 +4,13 @@ import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.time.Instant;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.Table;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;
@ -25,9 +28,30 @@ public class PpomppuBoardFeedStatus extends Auditable {
private Long latestParsedArticleId;
@Column
@Enumerated(EnumType.STRING)
private PpomppuBoardName boardName;
@Column
private Instant updatedAt;
/**
 * Creates a new feed status for a board, stamped with the current time.
 *
 * @param boardName       board the status belongs to
 * @param latestArticleId id to record as the latest parsed article
 * @return a new status instance without an id assigned
 */
public static PpomppuBoardFeedStatus of(PpomppuBoardName boardName, Long latestArticleId) {
    final Instant createdAt = Instant.now();
    return PpomppuBoardFeedStatus.builder()
                                 .latestParsedArticleId(latestArticleId)
                                 .boardName(boardName)
                                 .updatedAt(createdAt)
                                 .build();
}
/**
 * Records a new latest parsed article id and refreshes {@code updatedAt} to the current time.
 *
 * @param latestArticleId id to store as the latest parsed article
 */
public void updateArticleId(Long latestArticleId) {
this.updatedAt = Instant.now();
this.latestParsedArticleId = latestArticleId;
}
/**
 * All-fields constructor; annotated with {@code @Builder}, which backs the
 * {@code PpomppuBoardFeedStatus.builder()} call used by {@code of(...)}.
 *
 * @param id                    entity id
 * @param latestParsedArticleId id of the latest parsed article for the board
 * @param boardName             board this status belongs to
 * @param updatedAt             time the status was last changed
 */
@Builder
public PpomppuBoardFeedStatus(Long id, Long latestParsedArticleId, PpomppuBoardName boardName, Instant updatedAt) {
this.id = id;
this.latestParsedArticleId = latestParsedArticleId;
this.boardName = boardName;
this.updatedAt = updatedAt;
}
}

View File

@ -3,6 +3,8 @@ package com.myoa.engineering.crawl.ppomppu.processor.domain;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
@ -24,6 +26,7 @@ public class SubscribedBoard extends Auditable {
private Long userId;
@Column
@Enumerated(EnumType.STRING)
private PpomppuBoardName boardName;
}

View File

@ -1,6 +1,7 @@
package com.myoa.engineering.crawl.ppomppu.processor.dto;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
@ -47,7 +48,7 @@ public final class PpomppuArticleTransformer {
}
/**
 * Reads the {@code href} attribute from the {@code a} elements inside the given element and
 * expands it to a view-page URL via {@link PpomppuBoardName#ofViewPageUrl(String)}.
 *
 * @param td element that contains the article link
 * @return the view-page URL built from the link's {@code href}
 */
public static String toArticleUrl(Element td) {
    // Only the expanded URL is returned; the earlier raw-href return made this line unreachable.
    return PpomppuBoardName.ofViewPageUrl(td.getElementsByTag("a").attr("href"));
}
public static Integer toRecommended(Element td) {

View File

@ -2,6 +2,7 @@ package com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client;
import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebClientFilterFactory;
import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebFluxExchangeStragiesFactory;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import org.springframework.web.reactive.function.client.WebClient;
@ -19,12 +20,10 @@ import reactor.core.scheduler.Schedulers;
@Component
public class PpomppuBoardFeedRetriever {
private static final String PPOMPPU_URL = "https://www.ppomppu.co.kr/";
private final WebClient webClient;
public PpomppuBoardFeedRetriever(WebClient.Builder webClientBuilder) {
this.webClient = webClientBuilder.baseUrl(PPOMPPU_URL)
this.webClient = webClientBuilder.baseUrl(PpomppuBoardName.PPOMPPU_URL)
.exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml())
.filter(WebClientFilterFactory.logRequest())
.filter(WebClientFilterFactory.logResponse())
@ -33,7 +32,7 @@ public class PpomppuBoardFeedRetriever {
public Mono<String> getHtml(String uri) {
return webClient.get()
.uri("/zboard/zboard.php?id=ppomppu")
.uri(uri)
.exchangeToMono(e -> e.bodyToMono(String.class))
.publishOn(Schedulers.boundedElastic())
.onErrorResume(WebClientRequestException.class, t -> {

View File

@ -1,10 +1,14 @@
package com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuBoardFeedStatus;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.Optional;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
@Repository
public interface PpomppuBoardFeedStatusRepository extends JpaRepository<PpomppuBoardFeedStatus, Long> {
Optional<PpomppuBoardFeedStatus> findByBoardName(PpomppuBoardName boardName);
}

View File

@ -1,11 +1,16 @@
package com.myoa.engineering.crawl.ppomppu.processor.service;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuBoardFeedStatus;
import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository.PpomppuArticleRepository;
import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.repository.PpomppuBoardFeedStatusRepository;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
@Slf4j
@Service
@ -21,10 +26,34 @@ public class PpomppuArticleService {
this.ppomppuBoardFeedStatusRepository = ppomppuBoardFeedStatusRepository;
}
public void save(List<PpomppuArticle> articles) {
// TODO get latest parsed article id
// TODO filter articles
/**
 * Returns the articles whose id is strictly greater than the latest parsed article id
 * stored for the board; when no status is stored (or it holds no id) the threshold is 0.
 *
 * @param boardName board whose stored status supplies the threshold
 * @param articles  parsed articles to filter
 * @return a new list holding only the articles above the threshold
 */
@Transactional(readOnly = true)
public List<PpomppuArticle> filterOnlyNewArticles(PpomppuBoardName boardName, List<PpomppuArticle> articles) {
    final Long lastSeenId = ppomppuBoardFeedStatusRepository.findByBoardName(boardName)
                                                            .map(PpomppuBoardFeedStatus::getLatestParsedArticleId)
                                                            .orElse(0L);

    return articles.stream()
                   .filter(article -> article.getArticleId().compareTo(lastSeenId) > 0)
                   .collect(Collectors.toList());
}
/**
 * Persists the given articles and advances the board's latest-parsed-article marker.
 *
 * <p>Does nothing when {@code articles} is {@code null} or empty. The stored marker is only
 * moved forward: it is updated when the highest article id in {@code articles} is greater
 * than the stored one, and created when the board has no status yet.
 *
 * @param boardName board the articles were parsed from
 * @param articles  articles to persist
 */
@Transactional
public void save(PpomppuBoardName boardName, List<PpomppuArticle> articles) {
    // Without this guard an empty batch fell through to orElse(0L) and reset the stored
    // marker to 0, so the next crawl treated every article as new again.
    if (articles == null || articles.isEmpty()) {
        return;
    }

    final Long latestArticleId = articles.stream()
                                         .map(PpomppuArticle::getArticleId)
                                         .max(Long::compareTo)
                                         .orElse(0L);

    // save PpomppuBoardFeedStatus
    ppomppuBoardFeedStatusRepository.findByBoardName(boardName)
        .ifPresentOrElse(status -> {
            final Long storedId = status.getLatestParsedArticleId();
            // Never move the marker backwards.
            if (storedId == null || latestArticleId.compareTo(storedId) > 0) {
                status.updateArticleId(latestArticleId);
                ppomppuBoardFeedStatusRepository.save(status);
            }
        },
        () -> ppomppuBoardFeedStatusRepository.save(PpomppuBoardFeedStatus.of(boardName,
                                                                             latestArticleId)));

    // save real articles.
    ppomppuArticleRepository.saveAll(articles);
}
}

View File

@ -4,6 +4,7 @@ import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.processor.dto.PpomppuArticleTransformer;
import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client.PpomppuBoardFeedRetriever;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
@ -27,12 +28,14 @@ public class PpomppuFeedService {
this.ppomppuBoardFeedRetriever = ppomppuBoardFeedRetriever;
}
public Flux<PpomppuArticle> getArticles(PpomppuBoardName boardName) {
public Mono<List<PpomppuArticle>> getArticles(PpomppuBoardName boardName) {
final Mono<String> html = ppomppuBoardFeedRetriever.getHtml(boardName.getResourcePath());
final Mono<Element> tbody = extractTbodyFromHtml(html)
.doOnNext(e -> log.info("pre tbody - {}", e.html()));
return extractArticlesFromTbody(tbody).map(this::convertFromElement)
.doOnNext(e -> log.info("parsed Result: {}", e));
.map(e -> e.updateBoardName(boardName))
.doOnNext(e -> log.info("parsed Result: {}", e))
.collectList();
}
private Mono<Element> extractTbodyFromHtml(Mono<String> html) {

View File

@ -12,34 +12,42 @@ import lombok.NoArgsConstructor;
@Getter
@NoArgsConstructor
public enum PpomppuBoardName {
PPOMPPU_DOMESTIC_ETC("zboard/zboard.php?id=ppomppu&category=1"),
PPOMPPU_DOMESTIC_COMPUTER("zboard/zboard.php?id=ppomppu&category=4"),
PPOMPPU_DOMESTIC_DIGITAL("zboard/zboard.php?id=ppomppu&category=5"),
PPOMPPU_DOMESTIC_FOOD("zboard/zboard.php?id=ppomppu&category=6"),
PPOMPPU_DOMESTIC_BOOK("zboard/zboard.php?id=ppomppu&category=8"),
PPOMPPU_DOMESTIC_APPLIANCES("zboard/zboard.php?id=ppomppu&category=9"),
PPOMPPU_DOMESTIC_PARENTING("zboard/zboard.php?id=ppomppu&category=10"),
PPOMPPU_DOMESTIC_GIFTCARD("zboard/zboard.php?id=ppomppu&category=11"),
PPOMPPU_DOMESTIC_CLOTHES("zboard/zboard.php?id=ppomppu&category=12"),
PPOMPPU_DOMESTIC_COSMETIC("zboard/zboard.php?id=ppomppu&category=13"),
PPOMPPU_DOMESTIC_OUTDOOR("zboard/zboard.php?id=ppomppu&category=15"),
PPOMPPU_OVERSEA_ETC("zboard/zboard.php?id=ppomppu4&category=1"),
PPOMPPU_OVERSEA_APPLIANCES("zboard/zboard.php?id=ppomppu4&category=7"),
PPOMPPU_OVERSEA_TVAV("zboard/zboard.php?id=ppomppu4&category=8"),
PPOMPPU_OVERSEA_COMPUTER("zboard/zboard.php?id=ppomppu4&category=3"),
PPOMPPU_OVERSEA_DIGITAL("zboard/zboard.php?id=ppomppu4&category=4"),
PPOMPPU_OVERSEA_MOBILEACCESSORY("zboard/zboard.php?id=ppomppu4&category=9"),
PPOMPPU_OVERSEA_CLOTHES("zboard/zboard.php?id=ppomppu4&category=5"),
PPOMPPU_OVERSEA_WATCH("zboard/zboard.php?id=ppomppu4&category=2"),
PPOMPPU_OVERSEA_SHOES("zboard/zboard.php?id=ppomppu4&category=11"),
PPOMPPU_OVERSEA_FOOD("zboard/zboard.php?id=ppomppu4&category=10"),
PPOMPPU_OVERSEA_PARENTING("zboard/zboard.php?id=ppomppu4&category=6"),
PPOMPPU_DOMESTIC_ETC("zboard/zboard.php?id=ppomppu&category=1", true),
PPOMPPU_DOMESTIC_COMPUTER("zboard/zboard.php?id=ppomppu&category=4", true),
PPOMPPU_DOMESTIC_DIGITAL("zboard/zboard.php?id=ppomppu&category=5", true),
PPOMPPU_DOMESTIC_FOOD("zboard/zboard.php?id=ppomppu&category=6", true),
PPOMPPU_DOMESTIC_BOOK("zboard/zboard.php?id=ppomppu&category=8", true),
PPOMPPU_DOMESTIC_APPLIANCES("zboard/zboard.php?id=ppomppu&category=9", true),
PPOMPPU_DOMESTIC_PARENTING("zboard/zboard.php?id=ppomppu&category=10", true),
PPOMPPU_DOMESTIC_GIFTCARD("zboard/zboard.php?id=ppomppu&category=11", true),
PPOMPPU_DOMESTIC_CLOTHES("zboard/zboard.php?id=ppomppu&category=12", true),
PPOMPPU_DOMESTIC_COSMETIC("zboard/zboard.php?id=ppomppu&category=13", true),
PPOMPPU_DOMESTIC_OUTDOOR("zboard/zboard.php?id=ppomppu&category=15", true),
PPOMPPU_OVERSEA_ETC("zboard/zboard.php?id=ppomppu4&category=1", true),
PPOMPPU_OVERSEA_APPLIANCES("zboard/zboard.php?id=ppomppu4&category=7", true),
PPOMPPU_OVERSEA_TVAV("zboard/zboard.php?id=ppomppu4&category=8", true),
PPOMPPU_OVERSEA_COMPUTER("zboard/zboard.php?id=ppomppu4&category=3", true),
PPOMPPU_OVERSEA_DIGITAL("zboard/zboard.php?id=ppomppu4&category=4", true),
PPOMPPU_OVERSEA_MOBILEACCESSORY("zboard/zboard.php?id=ppomppu4&category=9", true),
PPOMPPU_OVERSEA_CLOTHES("zboard/zboard.php?id=ppomppu4&category=5", true),
PPOMPPU_OVERSEA_WATCH("zboard/zboard.php?id=ppomppu4&category=2", true),
PPOMPPU_OVERSEA_SHOES("zboard/zboard.php?id=ppomppu4&category=11", true),
PPOMPPU_OVERSEA_FOOD("zboard/zboard.php?id=ppomppu4&category=10", true),
PPOMPPU_OVERSEA_PARENTING("zboard/zboard.php?id=ppomppu4&category=6", true),
;
private String resourcePath;
private boolean crawlWithDefaultTimer;
PpomppuBoardName(String boardPath) {
PpomppuBoardName(String boardPath, boolean crawlWithDefaultTimer) {
this.resourcePath = boardPath;
this.crawlWithDefaultTimer = crawlWithDefaultTimer;
}
public static final String PPOMPPU_URL = "https://www.ppomppu.co.kr/";
public static String ofViewPageUrl(String articleUrl) {
return PPOMPPU_URL + "zboard/" + articleUrl;
}
}