Implement PpomppuBoardFeedRetriever

This commit is contained in:
woozu.shin 2021-09-26 00:26:32 +09:00
parent 08e1f99ab0
commit cf7425faae
7 changed files with 79 additions and 15 deletions

View File

@ -0,0 +1,48 @@
package com.myoa.engineering.crawl.ppomppu.processor.configuration.factory;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.reactive.function.client.ClientRequest;
import org.springframework.web.reactive.function.client.ClientResponse;
import org.springframework.web.reactive.function.client.ExchangeFilterFunction;
import reactor.core.publisher.Mono;
/**
 * WebClientFilterFactory
 *
 * <p>Static factory for {@link ExchangeFilterFunction}s that write outgoing WebClient requests and
 * incoming responses to the log at INFO level. Logging is best-effort: any exception raised while
 * building a log line is caught and logged, and the request/response is passed through unchanged.
 *
 * @author Shin Woo-jin (woozu.shin@kakaoent.com)
 * @since 2021-09-07
 */
@Slf4j
public final class WebClientFilterFactory {

    /** Utility class; not instantiable. */
    private WebClientFilterFactory() {}

    /**
     * Creates a filter that logs the URI, HTTP method and headers of every outgoing request.
     *
     * @return a request-processing filter that leaves the request untouched
     */
    public static ExchangeFilterFunction logRequest() {
        return ExchangeFilterFunction.ofRequestProcessor(WebClientFilterFactory::writeRequest);
    }

    /**
     * Creates a filter that logs the raw status code and headers of every received response.
     *
     * @return a response-processing filter that leaves the response untouched
     */
    public static ExchangeFilterFunction logResponse() {
        return ExchangeFilterFunction.ofResponseProcessor(WebClientFilterFactory::writeResponse);
    }

    /**
     * Logs the request line and headers, then re-emits the same request.
     *
     * @param clientRequest the request about to be exchanged
     * @return a Mono emitting {@code clientRequest} unchanged
     */
    private static Mono<ClientRequest> writeRequest(ClientRequest clientRequest) {
        try {
            // NOTE(review): headers are logged verbatim; if callers ever add credentials
            // (e.g. Authorization, Cookie) they will end up in the log - confirm this is acceptable.
            log.info("[WEBCLIENT REQUEST] uri : {} method : {} headers : {}",
                     clientRequest.url(), clientRequest.method(), clientRequest.headers());
        } catch (Exception e) {
            // Logging must never break the exchange, so the failure is only reported.
            log.error("[WEBCLIENT REQUEST] write request failed", e);
        }
        return Mono.just(clientRequest);
    }

    /**
     * Logs the response status code and headers, then re-emits the same response.
     *
     * @param clientResponse the response that was received
     * @return a Mono emitting {@code clientResponse} unchanged
     */
    private static Mono<ClientResponse> writeResponse(ClientResponse clientResponse) {
        try {
            // Tagged as RESPONSE (previously mis-tagged as REQUEST) so response lines can be
            // told apart from request lines and match the error message below.
            log.info("[WEBCLIENT RESPONSE] statusCode : {} headers : {}",
                     clientResponse.rawStatusCode(), clientResponse.headers().asHttpHeaders());
        } catch (Exception e) {
            // Logging must never break the exchange, so the failure is only reported.
            log.error("[WEBCLIENT RESPONSE] write response failed", e);
        }
        return Mono.just(clientResponse);
    }
}

View File

@ -17,7 +17,7 @@ public final class WebFluxExchangeStragiesFactory {
private WebFluxExchangeStragiesFactory() {}
public static ExchangeStrategies getDefault() {
public static ExchangeStrategies ofDefault() {
final ObjectMapper mapper = ObjectMapperFactory.defaultMapper();
return ExchangeStrategies.builder()
.codecs(configurer -> {
@ -31,5 +31,19 @@ public final class WebFluxExchangeStragiesFactory {
})
.build();
}
/**
 * Builds {@link ExchangeStrategies} whose default Jackson JSON encoder and decoder are replaced by
 * instances bound to the {@code text/html} MIME type, both backed by the mapper returned from
 * {@code ObjectMapperFactory.defaultMapper()}.
 *
 * <p>The in-memory buffer limit of the default codecs is set to {@code -1}
 * (NOTE(review): Spring documents -1 as "unlimited" - confirm that unbounded buffering is intended).
 *
 * @return the configured exchange strategies
 */
public static ExchangeStrategies ofTextHtml() {
    final ObjectMapper objectMapper = ObjectMapperFactory.defaultMapper();
    return ExchangeStrategies.builder()
                             .codecs(codecConfigurer -> {
                                 // Lift the buffer cap first, then swap in the text/html-bound Jackson codecs.
                                 codecConfigurer.defaultCodecs().maxInMemorySize(-1);

                                 final Jackson2JsonEncoder htmlEncoder =
                                         new Jackson2JsonEncoder(objectMapper, MimeTypeUtils.TEXT_HTML);
                                 codecConfigurer.defaultCodecs().jackson2JsonEncoder(htmlEncoder);

                                 final Jackson2JsonDecoder htmlDecoder =
                                         new Jackson2JsonDecoder(objectMapper, MimeTypeUtils.TEXT_HTML);
                                 codecConfigurer.defaultCodecs().jackson2JsonDecoder(htmlDecoder);
                             })
                             .build();
}
}

View File

@ -1,9 +1,8 @@
package com.myoa.engineering.crawl.ppomppu.processor.infrastructure.client;
import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebClientFilterFactory;
import com.myoa.engineering.crawl.ppomppu.processor.configuration.factory.WebFluxExchangeStragiesFactory;
import com.myoa.engineering.crawl.ppomppu.support.util.WebUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.core.ParameterizedTypeReference;
import org.springframework.stereotype.Component;
import org.springframework.web.reactive.function.client.WebClient;
import org.springframework.web.reactive.function.client.WebClientRequestException;
@ -26,21 +25,22 @@ public class PpomppuBoardFeedRetriever {
/**
 * Builds the WebClient used by this retriever from the injected builder: the base URL is set to
 * PPOMPPU_URL, exchange strategies and default headers are applied, and the request/response
 * logging filters from WebClientFilterFactory are registered.
 *
 * NOTE(review): this span comes from a rendered diff without +/- markers, so removed and added
 * lines appear together (two exchangeStrategies(...) calls). getDefault() and WebUtil are shown
 * as renamed elsewhere in the same change (ofDefault, WebRequestUtil), so presumably the lines
 * using them are the removed side - verify against the actual file.
 *
 * @param webClientBuilder builder the WebClient is derived from
 */
public PpomppuBoardFeedRetriever(WebClient.Builder webClientBuilder) {
this.webClient = webClientBuilder.baseUrl(PPOMPPU_URL)
.exchangeStrategies(WebFluxExchangeStragiesFactory.getDefault())
.defaultHeader("Content-Type", "text/html")
.defaultHeader(WebUtil.HEADER_USER_AGENT_KEY, WebUtil.HEADER_USER_AGENT_VALUE)
.exchangeStrategies(WebFluxExchangeStragiesFactory.ofTextHtml())
// Log every outgoing request and every received response.
.filter(WebClientFilterFactory.logRequest())
.filter(WebClientFilterFactory.logResponse())
.build();
}
/**
 * Issues a GET through the configured WebClient and returns the response body as a String.
 * A WebClientRequestException is logged (class name only) and converted into an empty Mono,
 * so callers see completion without a value instead of an error in that case.
 *
 * NOTE(review): this span comes from a rendered diff without +/- markers, so both the old and
 * the new form of the .uri(...), .exchangeToMono(...) and lambda-closing lines appear. One
 * variant uses the uri parameter, the other a hard-coded path that ignores it - confirm which
 * is live, and whether ignoring the parameter is intended.
 *
 * @param uri request URI (used by the .uri(uri) variant only)
 * @return the body as a String, or an empty Mono when the request itself fails
 */
public Mono<String> getHtml(String uri) {
return webClient.get()
.uri(uri)
.exchangeToMono(e -> e.bodyToMono(new ParameterizedTypeReference<String>() {}))
.uri("/zboard/zboard.php?id=ppomppu")
.exchangeToMono(e -> e.bodyToMono(String.class))
// Downstream operators run on the bounded-elastic scheduler from here on.
.publishOn(Schedulers.boundedElastic())
.onErrorResume(WebClientRequestException.class, t -> {
log.info("Exception occured, ignoring. : {}", t.getClass().getSimpleName());
return Mono.empty();
});
})
// Logs the complete body at INFO level for every emitted value.
.doOnNext(e -> log.info("[getHtml] {}", e));
}
}

View File

@ -29,8 +29,8 @@ public class PpomppuFeedService {
/**
 * Retrieves the HTML for the given board, extracts its tbody element, splits that into article
 * row elements and converts each row via convertFromElement, logging every converted result.
 *
 * NOTE(review): this span comes from a rendered diff without +/- markers, so the tbody
 * declaration appears twice (without and with the extra doOnNext logging) - confirm which
 * variant is live.
 *
 * @param boardName board whose resource path is passed to the feed retriever
 * @return the converted articles
 */
public Flux<PpomppuArticle> getArticles(PpomppuBoardName boardName) {
final Mono<String> html = ppomppuBoardFeedRetriever.getHtml(boardName.getResourcePath());
final Mono<Element> tbody = extractTbodyFromHtml(html);
final Mono<Element> tbody = extractTbodyFromHtml(html)
.doOnNext(e -> log.info("pre tbody - {}", e.html()));
return extractArticlesFromTbody(tbody).map(this::convertFromElement)
.doOnNext(e -> log.info("parsed Result: {}", e));
}
@ -39,6 +39,7 @@ public class PpomppuFeedService {
return html.map(Jsoup::parse)
.mapNotNull(e -> e.getElementById("revolution_main_table"))
.map(e -> e.getElementsByTag("tbody"))
.doOnNext(e -> log.info("tbody - {}", e.html()))
.map(e -> e.stream()
.findFirst()
.orElseThrow(() -> new IndexOutOfBoundsException("no tbody")));
@ -46,7 +47,7 @@ public class PpomppuFeedService {
private Flux<Element> extractArticlesFromTbody(Mono<Element> tbody) {
return Flux.concat(tbody.flatMapMany(e -> Flux.fromArray(e.select("tr.list0").toArray(Element[]::new))),
tbody.flatMapMany(e -> Flux.fromArray(e.select("tr.list0").toArray(Element[]::new))));
tbody.flatMapMany(e -> Flux.fromArray(e.select("tr.list1").toArray(Element[]::new))));
}
private PpomppuArticle convertFromElement(Element element) {

View File

@ -3,4 +3,5 @@ spring:
activate:
on-profile: development
import:
- classpath:/development/webclient.yml
- classpath:/development/webclient.yml
- classpath:/development/database.yml

View File

@ -6,9 +6,9 @@ package com.myoa.engineering.crawl.ppomppu.support.util;
* @author Shin Woo-jin (woozu.shin@kakaoent.com)
* @since 2021-09-08
*/
public final class WebUtil {
public final class WebRequestUtil {
private WebUtil() {}
private WebRequestUtil() {}
public static final String HEADER_USER_AGENT_KEY = "User-Agent";
public static final String HEADER_USER_AGENT_VALUE = "Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36";