diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java index ccf8d832..f7857be0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java @@ -1,10 +1,19 @@ package com.iqser.red.service.redaction.v1.server.utils; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; import java.text.DateFormat; import java.text.SimpleDateFormat; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.format.ResolverStyle; import java.util.Date; -import java.util.List; import java.util.Locale; +import java.util.Objects; import java.util.Optional; import lombok.AccessLevel; @@ -17,39 +26,65 @@ import lombok.extern.slf4j.Slf4j; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class DateConverter { - static List formats = List.of(new SimpleDateFormat("dd MMM yy", Locale.ENGLISH), - new SimpleDateFormat("dd MM yyyy", Locale.ENGLISH), - new SimpleDateFormat("dd MM yyyy.", Locale.ENGLISH), - new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH), - new SimpleDateFormat("MMMM dd, yyyy", Locale.ENGLISH), - new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH)); + private static DateTimeFormatter DATE_TIME_FORMATTER; public Optional parseDate(String dateAsString) { - Date date = null; - for (SimpleDateFormat format : formats) { - - try { - date = format.parse(dateAsString); - break; - } catch (Exception e) { - log.warn("Failed to parse date from string {}. \n{}", dateAsString, e.getMessage()); - // ignore, try next... - } - } - if (date == null) { + DateTimeFormatter formatter = getDateTimeFormatter(); + String cleanDate = dateAsString.trim(); + cleanDate = removeTrailingDot(cleanDate); + try { + LocalDate localDate = LocalDate.parse(cleanDate, formatter); + Date date = Date.from(localDate.atStartOfDay(ZoneId.systemDefault()).toInstant()); + return Optional.of(date); + } catch (DateTimeParseException e) { + log.warn("Failed to parse date: {}", cleanDate); return Optional.empty(); } - return Optional.of(date); + } public String convertDate(Date date, String resultFormat) { DateFormat resultDateFormat = new SimpleDateFormat(resultFormat, Locale.ENGLISH); - return resultDateFormat.format(date); } + + private DateTimeFormatter getDateTimeFormatter() { + + if (DATE_TIME_FORMATTER == null) { + DATE_TIME_FORMATTER = createFormatterFromResource(); + } + return DATE_TIME_FORMATTER; + } + + + private DateTimeFormatter createFormatterFromResource() { + + DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(Objects.requireNonNull(DateConverter.class.getResourceAsStream("/date_formats.txt"))))) { + String line; + while ((line = reader.readLine()) != null) { + builder.appendOptional(DateTimeFormatter.ofPattern(line.trim(), Locale.ENGLISH)); + } + } catch (IOException e) { + throw new RuntimeException("Error reading date format file: " + e.getMessage()); + } + return builder.toFormatter().withResolverStyle(ResolverStyle.SMART).withLocale(Locale.ENGLISH); + } + + + private String removeTrailingDot(String dateAsString) { + + String str = dateAsString; + if (str != null && !str.isEmpty() && str.charAt(str.length() - 1) == '.') { + str = str.substring(0, str.length() - 1); + } + + return str; + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt b/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt new file mode 100644 index 00000000..b4fb4351 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt @@ -0,0 +1,38 @@ +dd MMM yyyy +dd MMM yy +dd MM yyyy +dd MMMM yyyy +MMMM dd, yyyy +dd-MMM-yyyy +dd.MM.yyyy +yyyy/MM/dd +yyyy-MM-dd +dd-MM-yyyy +MMMM d, yyyy +d MMMM yyyy +MMM d, yyyy +d['.'] MMM yyyy +d-MMM-yyyy +d['th']['st']['nd']['rd'] 'of' MMMM, yyyy +MMMM d['th']['st']['nd']['rd'], yyyy +yyyy, MMMM d +yyyy.MM.dd +yyyyMMdd +dd-MM-yy +dd/MM/yy +MMMM d, yy +d MMMM, yy +d['th']['st']['nd']['rd'] MMM yyyy +MMM d['th']['st']['nd']['rd'], yy +yyyy-MMM-dd +MMM-dd-yyyy +dd-MM-yyyy +yyyy, MMMM dd +dd.MMM.yyyy +dd.MMMM.yyyy +dd.MMM.yy +dd.MMMM.yy +dd.MMM-yyyy +dd.MMMM-yyyy +d['th']['st']['nd']['rd'] MMMM yy +d['th']['st']['nd']['rd'] MMMM yyyy \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java index cdafda65..c452b809 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java @@ -1,5 +1,7 @@ package com.iqser.red.service.redaction.v1.server; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.when; import java.io.FileOutputStream; @@ -106,6 +108,36 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest { } + @Test + public void testConvertingVariousDateFormats() { + + AnalyzeRequest request = uploadFileToStorage("files/dates/date_formats.pdf"); + System.out.println("Start Full integration test"); + analyzeDocumentStructure(LayoutParsingType.DOCUMINE, request); + System.out.println("Finished structure analysis"); + analyzeService.analyze(request); + System.out.println("Finished analysis"); + + var componentLog = redactionStorageService.getComponentLog(TEST_DOSSIER_ID, TEST_FILE_ID); + assertNotNull(componentLog); + + var experimentalDates = componentLog.getComponentLogEntries().stream().filter(c -> c.getName().equals("Experimental_Starting_Date")).findFirst().get(); + assertNotNull(experimentalDates); + + String dates = experimentalDates.getComponentValues().get(0).getValue(); + String[] dateArray = dates.split(", "); + boolean allEqual = true; + for (String date : dateArray) { + if (!"10/01/2022".equals(date)) { + allEqual = false; + break; + } + } + + assertTrue(allEqual); + } + + @Test // @Disabled public void testTopOfPage13InNotHeader() throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/dates/date_formats.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/dates/date_formats.pdf new file mode 100644 index 00000000..553b18be Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/dates/date_formats.pdf differ