package com.myoa.engineering.crawl.ppomppu.processor.service;

import com.myoa.engineering.crawl.ppomppu.processor.domain.PpomppuArticle;
import com.myoa.engineering.crawl.ppomppu.processor.dto.PpomppuArticleParser;
import com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client.PpomppuBoardFeedRetriever;
import com.myoa.engineering.crawl.ppomppu.support.dto.code.PpomppuBoardName;
import java.util.Comparator;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.springframework.stereotype.Component;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;

/**
 * PpomppuFeedService
 *
 * <p>Retrieves the HTML of a board through {@link PpomppuBoardFeedRetriever}, locates the
 * {@code tbody} of the element with id {@code revolution_main_table}, and converts its
 * {@code tr.list0} / {@code tr.list1} rows into {@link PpomppuArticle} instances.
 *
 * @author Shin Woo-jin (woozu.shin@kakaoent.com)
 * @since 2021-09-08
 */
@Slf4j
@Component
public class PpomppuFeedService {

    private final PpomppuBoardFeedRetriever ppomppuBoardFeedRetriever;

    public PpomppuFeedService(PpomppuBoardFeedRetriever ppomppuBoardFeedRetriever) {
        this.ppomppuBoardFeedRetriever = ppomppuBoardFeedRetriever;
    }

    /**
     * Fetches the given board and returns its articles sorted by article id (natural order).
     *
     * <p>Each parsed article is passed through {@code updateBoardName(boardName)} before sorting.
     * If the page has no element with id {@code revolution_main_table}, the pipeline is empty and
     * the resulting list is empty.
     *
     * @param boardName board whose {@code getResourcePath()} is handed to the retriever
     * @return a {@link Mono} emitting the sorted list of articles; it signals
     *         {@link IndexOutOfBoundsException} when the main table contains no {@code tbody}
     */
    public Mono<List<PpomppuArticle>> getArticles(PpomppuBoardName boardName) {
        final Mono<String> html = ppomppuBoardFeedRetriever.getHtml(boardName.getResourcePath());
        final Mono<Element> tbody = extractTbodyFromHtml(html);
        return extractArticlesFromTbody(tbody)
                .map(this::convertFromElement)
                .map(article -> article.updateBoardName(boardName))
                .sort(Comparator.comparing(PpomppuArticle::getArticleId))
                .collectList();
    }

    /**
     * Parses the HTML and extracts the first {@code tbody} under {@code #revolution_main_table}.
     *
     * @param html publisher of the raw page markup
     * @return the first {@code tbody} element; empty when the table id is absent from the
     *         document; an {@link IndexOutOfBoundsException} signal when the table has no
     *         {@code tbody}
     */
    private Mono<Element> extractTbodyFromHtml(Mono<String> html) {
        return html.map(Jsoup::parse)
                   // getElementById returns null when the id is missing; mapNotNull turns
                   // that into an empty Mono instead of a NullPointerException downstream.
                   .mapNotNull(document -> document.getElementById("revolution_main_table"))
                   .map(table -> table.getElementsByTag("tbody"))
                   .map(bodies -> bodies.stream()
                                        .findFirst()
                                        .orElseThrow(() -> new IndexOutOfBoundsException("no tbody")));
    }

    /**
     * Emits every {@code tr.list0} row followed by every {@code tr.list1} row of the table body.
     *
     * <p>The source {@code Mono} is subscribed to exactly once. Subscribing to it separately for
     * each selector would re-run a cold upstream (and with it whatever work produces the HTML)
     * once per selector.
     *
     * @param tbody publisher of the table body element
     * @return the matching rows, {@code tr.list0} matches first, then {@code tr.list1} matches
     */
    private Flux<Element> extractArticlesFromTbody(Mono<Element> tbody) {
        return tbody.flatMapMany(body -> Flux.concat(
                Flux.fromIterable(body.select("tr.list0")),
                Flux.fromIterable(body.select("tr.list1"))));
    }

    /**
     * Converts one table row into an article by handing its {@code td} cells to
     * {@link PpomppuArticleParser#toArticle}.
     *
     * @param element a table row element
     * @return the article produced by the parser
     */
    private PpomppuArticle convertFromElement(Element element) {
        return PpomppuArticleParser.toArticle(element.getElementsByTag("td"));
    }
}