[NO-ISSUE] Implement v2

This commit is contained in:
woozu-shin
2024-04-30 22:34:49 +09:00
parent 0524a18ee5
commit 0c4be3cc05
147 changed files with 3873 additions and 546 deletions

View File

@@ -0,0 +1,118 @@
package com.myoa.engineering.crawl.shopping.crawlhandler.parser;
import com.myoa.engineering.crawl.shopping.domain.entity.v1.PpomppuArticle;
import com.myoa.engineering.crawl.shopping.support.dto.constant.PpomppuBoardName;
import com.myoa.engineering.crawl.shopping.util.DateTimeUtils;
import com.myoa.engineering.crawl.shopping.util.NumberUtils;
import com.myoa.engineering.crawl.shopping.util.TestDataUtils;
import io.micrometer.core.instrument.util.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import java.time.ZonedDateTime;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Exploratory tests that parse a saved board listing page into {@link PpomppuArticle} objects.
 *
 * <p>The parsing helpers live in this test class. A table row is treated as an article when it has
 * exactly six {@code td} cells and its first cell contains only digits. Cells are read by position:
 * 0 = article id, 2 = title and link, 3 = registration time, 4 = votes, 5 = hit count.
 */
class PpomppuArticleParserV2Test {

    /** Matches text that consists solely of one or more digits. */
    private static final Pattern NUMERIC_PATTERN = Pattern.compile("\\d+");

    /** Captures the (shortest) text enclosed in each pair of square brackets. */
    private static final Pattern BOARD_NAME_PATTERN = Pattern.compile("\\[(.+?)\\]");

    /** Separator between the two vote counts inside the vote cell. */
    private static final String VOTE_SEPARATOR = " - ";

    /**
     * Parses every article row of the sample page and prints the number of parsed articles
     * followed by the first two of them.
     */
    @Test
    void test1() {
        String data = TestDataUtils.fileToString("testdata/zboard/file1.html");
        Document document = Jsoup.parse(data);
        Elements trList = document.getElementById("revolution_main_table").getElementsByTag("tr");
        List<PpomppuArticle> articles = trList.stream()
                .filter(this::isRealArticle)
                .map(this::parse)
                .toList();
        System.out.println(articles.size());
        System.out.println(articles.get(0));
        System.out.println(articles.get(1));
    }

    /**
     * Tells whether a table row looks like an article row.
     *
     * @param tr the table row to inspect
     * @return {@code true} when the row has exactly six cells and the first one is purely numeric
     */
    private boolean isRealArticle(Element tr) {
        Elements tdList = tr.getElementsByTag("td");
        return tdList.size() == 6 && hasOnlyNumeric(tdList.get(0));
    }

    /**
     * @param td the cell to inspect
     * @return {@code true} when the whole cell text is one or more digits
     */
    private boolean hasOnlyNumeric(Element td) {
        return NUMERIC_PATTERN.matcher(td.text()).matches();
    }

    /**
     * Converts one article row into a {@link PpomppuArticle}.
     *
     * @param tr a row accepted by {@link #isRealArticle(Element)}
     * @return the article built from the row's cells
     * @throws NumberFormatException if the id cell or a vote count is not a valid number
     */
    public PpomppuArticle parse(Element tr) {
        Elements tdList = tr.getElementsByTag("td");
        Element titleCell = tdList.get(2);

        Long articleId = parseArticleId(tdList.get(0));
        String title = titleCell.text();
        String articleUrl = parseArticleUrl(titleCell.getElementsByTag("a").attr("href"));
        PpomppuBoardName boardName = parseBoardName(title);
        Integer recommended = parseRecommended(tdList.get(4));
        Integer hit = NumberUtils.parseInt(tdList.get(5).text(), 0);
        ZonedDateTime registeredAt = DateTimeUtils.parse(tdList.get(3).text());

        return PpomppuArticle.builder()
                .articleId(articleId)
                .title(title)
                .boardName(boardName)
                .articleUrl(articleUrl)
                .recommended(recommended)
                .hit(hit)
                .registeredAt(registeredAt)
                .build();
    }

    /**
     * Reads the numeric article id from a cell.
     *
     * @param td the id cell
     * @return the cell text, trimmed and parsed as a {@code long}
     * @throws NumberFormatException if the text is not a valid number
     */
    public Long parseArticleId(Element td) {
        return Long.parseLong(td.text().trim());
    }

    /**
     * Computes the net vote count from a cell whose text has the form {@code "<up> - <down>"}.
     *
     * @param td the vote cell
     * @return first count minus second count, or {@code null} when the cell text is empty
     * @throws NumberFormatException          if either count is not a valid integer
     * @throws ArrayIndexOutOfBoundsException if the text does not contain the separator
     */
    public static Integer parseRecommended(Element td) {
        final String voteString = td.text();
        if (StringUtils.isEmpty(voteString)) {
            return null;
        }
        // Split once and reuse the parts instead of re-reading and re-splitting the cell text.
        final String[] votes = voteString.split(VOTE_SEPARATOR);
        final int voteUp = Integer.parseInt(votes[0]);
        final int voteDown = Integer.parseInt(votes[1]);
        return voteUp - voteDown;
    }

    /**
     * Delegates to {@link PpomppuBoardName#ofViewPageUrl(String)}.
     *
     * @param data the {@code href} value taken from the title link
     * @return whatever {@code ofViewPageUrl} returns for that value
     */
    public static String parseArticleUrl(String data) {
        return PpomppuBoardName.ofViewPageUrl(data);
    }

    /**
     * Resolves the board from the LAST square-bracketed tag in a title.
     *
     * @param fullTitle the complete title text
     * @return the result of {@code PpomppuBoardName.ofBoardName(lastTag, true)}; {@code lastTag} is
     *         {@code null} when the title contains no bracketed tag
     */
    public PpomppuBoardName parseBoardName(String fullTitle) {
        Matcher matcher = BOARD_NAME_PATTERN.matcher(fullTitle);
        String lastMatched = null;
        // Keep overwriting so that only the final bracketed tag survives the loop.
        while (matcher.find()) {
            lastMatched = matcher.group(1);
        }
        return PpomppuBoardName.ofBoardName(lastMatched, true);
    }

    /** Prints the board resolved from a title that contains several bracketed tags. */
    @Test
    void test2() {
        PpomppuBoardName boardName = parseBoardName("[자사몰]푸마 메쉬 폼스트라이프 [에디션] 5종 [세트] (18,220원/무료)6 [의류/잡화]");
        System.out.println(boardName);
    }
}

View File

@@ -0,0 +1,81 @@
package com.myoa.engineering.crawl.shopping.event.handler;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.jeasy.random.EasyRandom;
import org.jeasy.random.EasyRandomParameters;
import org.junit.jupiter.api.Test;
import java.util.Collection;
import java.util.List;
/**
 * Exploratory tests for keyword matching with an Aho-Corasick {@link Trie}.
 *
 * <p>Neither test asserts anything; both print the emitted matches to standard output.
 */
class ArticleUpsertEventListenerTest {

    /** Searches a fixed list of article titles for two keywords and prints the matches per title. */
    @Test
    public void test1() {
        List<String> baseData = List.of(
                "[공홈]베베숲 시그니처 위드 블루 20팩 (25,990원/무료)2 [기타]",
                "[현대H몰]10주년 스페셜 에디션 봉고데기 40mm 세트 (67,640원/무료)3 [가전/가구]",
                "[SSG]필립스 면도기 칫솔 기획전( 437,000원~/무료)1 [기타]",
                "(티몬)제스프리 골드키위 중대과 1.8kg내외 (카페,토페 14,823원/무배)3 [식품/건강]",
                "[공식몰]연세생활건강 당뇨영양식 24팩 (33,500원/유배)1 [식품/건강]",
                "[인터파크]크리스탈라이트 아이스티 에이드 온더고 60개입 멀티팩(17,800원/무료)6 [기타]",
                "[인팍쇼핑]샤카웨어 액티브 맥스 헤비웨이트 10종 택 1 (7,350원/무료)7 [의류/잡화]"
        );
        Trie trie = Trie.builder()
                .ignoreCase()
                .addKeyword("블루")
                .addKeyword("봉고")
                .build();
        List<Collection<Emit>> list = baseData.stream()
                .map(trie::parseText)
                .toList();
        System.out.println(list);
    }

    /**
     * Builds a trie from 1000 random keywords (length range 2-10), runs it over 2000 random
     * sentences (length range 100-1000) and prints the matches found in each sentence.
     */
    @Test
    void test2() {
        EasyRandomParameters keywordParameters = new EasyRandomParameters()
                .stringLengthRange(2, 10);
        EasyRandom keywordGenerator = new EasyRandom(keywordParameters);

        EasyRandomParameters sentenceParameters = new EasyRandomParameters()
                .stringLengthRange(100, 1000);
        EasyRandom sentenceGenerator = new EasyRandom(sentenceParameters);

        Trie.TrieBuilder trieBuilder = Trie.builder().ignoreOverlaps();
        keywordGenerator.objects(TestA.class, 1000)
                .map(TestA::getFieldA)
                .forEach(trieBuilder::addKeyword);
        Trie trie = trieBuilder.build();

        List<Collection<Emit>> listOfEmits = sentenceGenerator.objects(TestA.class, 2000)
                .map(TestA::getFieldA)
                .map(trie::parseText)
                .toList();
        for (Collection<Emit> emits : listOfEmits) {
            System.out.println(emits);
            System.out.println("--------------------------");
        }
    }

    /**
     * Single-string holder used as the target type for random data generation.
     *
     * <p>Declared {@code static} so that instances do not carry a hidden reference to the
     * enclosing test class.
     */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class TestA {
        private String fieldA;
    }
}

View File

@@ -0,0 +1,62 @@
package com.myoa.engineering.crawl.shopping.util;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.lang.NonNull;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Scanner;
import java.util.stream.Collectors;
/**
 * Static helpers for loading classpath resources as text or as JSON-deserialized objects.
 *
 * <p>All JSON handling goes through the shared {@code ObjectMapperFactory.DEFAULT_MAPPER}.
 */
public final class TestDataUtils {

    private static final ObjectMapper OBJECT_MAPPER = ObjectMapperFactory.DEFAULT_MAPPER;

    private TestDataUtils() {
    }

    /**
     * Reads one classpath resource and deserializes it into a single object.
     *
     * @param resourcePath classpath-relative path of the JSON resource
     * @param clazz        target type
     * @param <T>          target type
     * @return the deserialized object
     * @throws IOException              if the content cannot be deserialized into {@code clazz}
     * @throws IllegalArgumentException if the resource does not exist
     */
    public static <T> T inputStreamToObject(@NonNull String resourcePath,
                                            @NonNull Class<T> clazz) throws IOException {
        return OBJECT_MAPPER.readValue(fileToString(resourcePath), clazz);
    }

    /**
     * Varargs convenience overload of {@link #inputStreamsToList(List, Class)}.
     *
     * @param clazz        target element type
     * @param resourcePath classpath-relative paths, one JSON object per resource
     * @param <T>          target element type
     * @return one deserialized object per path, in the given order
     * @throws IOException              if any resource cannot be deserialized
     * @throws IllegalArgumentException if any resource does not exist
     */
    public static <T> List<T> inputStreamsToList(@NonNull Class<T> clazz,
                                                 @NonNull String... resourcePath) throws IOException {
        return inputStreamsToList(Arrays.asList(resourcePath), clazz);
    }

    /**
     * Reads several classpath resources and deserializes each into one object.
     *
     * <p>A resource that fails to deserialize aborts the whole call with the underlying
     * exception; the result never contains {@code null} placeholders for failed resources.
     *
     * @param resourcePath classpath-relative paths, one JSON object per resource
     * @param clazz        target element type
     * @param <T>          target element type
     * @return one deserialized object per path, in the given order
     * @throws IOException              if any resource cannot be deserialized
     * @throws IllegalArgumentException if any resource does not exist
     */
    public static <T> List<T> inputStreamsToList(@NonNull List<String> resourcePath,
                                                 @NonNull Class<T> clazz) throws IOException {
        // A plain loop lets the checked JSON exception propagate through the declared
        // "throws IOException" instead of being swallowed inside a stream lambda.
        final List<T> parsed = new ArrayList<>(resourcePath.size());
        for (String path : resourcePath) {
            parsed.add(OBJECT_MAPPER.readValue(fileToString(path), clazz));
        }
        return parsed;
    }

    /**
     * Reads one classpath resource that contains a JSON array and deserializes it into a list.
     *
     * @param resourcePath classpath-relative path of the JSON resource
     * @param clazz        target element type
     * @param <T>          target element type
     * @return the deserialized list
     * @throws IOException              if the content cannot be deserialized
     * @throws IllegalArgumentException if the resource does not exist
     */
    public static <T> List<T> inputStreamToList(@NonNull String resourcePath,
                                                @NonNull Class<T> clazz) throws IOException {
        return OBJECT_MAPPER.readValue(
                fileToString(resourcePath),
                OBJECT_MAPPER.getTypeFactory().constructCollectionType(List.class, clazz));
    }

    /**
     * Loads a classpath resource, via the current thread's context class loader, as one string.
     *
     * <p>The bytes are decoded as UTF-8. NOTE(review): this assumes every fixture is stored as
     * UTF-8 - confirm for the HTML samples.
     *
     * @param resourcePath classpath-relative path of the resource
     * @return the full content of the resource, or {@code ""} when it is empty
     * @throws IllegalArgumentException if the resource does not exist
     */
    public static String fileToString(@NonNull String resourcePath) {
        final ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        final InputStream stream = classLoader.getResourceAsStream(resourcePath);
        if (stream == null) {
            throw new IllegalArgumentException("Test resource not found on classpath: " + resourcePath);
        }
        // "\\A" anchors at the start of input, so the first token is the entire content.
        // Closing the scanner also closes the underlying stream.
        try (Scanner scanner = new Scanner(stream, StandardCharsets.UTF_8.name()).useDelimiter("\\A")) {
            return scanner.hasNext() ? scanner.next() : "";
        }
    }
}

View File

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<included>
<!--
  Logback fragment (root element <included>), meant to be pulled into another
  configuration via <include>. It defines the default log level, includes two
  Spring Boot Logback resources and attaches the CONSOLE appender to the root logger.
-->
<!-- =========== property BETA ========= -->
<!-- DEFAULT_LEVEL takes the value of DEFAULT_LEVEL_CONFIG when that variable is defined, otherwise INFO. -->
<property name="DEFAULT_LEVEL" value="${DEFAULT_LEVEL_CONFIG:-INFO}"/>
<!-- =========== include appender =========== -->
<!-- The CONSOLE appender referenced below is presumably declared by console-appender.xml (verify). -->
<include resource="org/springframework/boot/logging/logback/defaults.xml"/>
<include resource="org/springframework/boot/logging/logback/console-appender.xml"/>
<!-- =========== root logger ============== -->
<root level="${DEFAULT_LEVEL}">
<appender-ref ref="CONSOLE"/>
</root>
</included>

View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Root Logback configuration. Exposes the Spring property "log.defaultLevel" to Logback as
  DEFAULT_LEVEL_CONFIG, then includes logback-development.xml.
  The root element must be spelled exactly "configuration".
-->
<configuration>
    <springProperty name="DEFAULT_LEVEL_CONFIG" source="log.defaultLevel"/>
    <include resource="logback-development.xml"/>
</configuration>

File diff suppressed because it is too large Load Diff