1 package com.acumenvelocity.ath.common;
2
3 import java.io.File;
4 import java.io.FileInputStream;
5 import java.io.IOException;
6 import java.io.InputStream;
7 import java.net.URI;
8 import java.nio.charset.StandardCharsets;
9 import java.nio.file.Files;
10 import java.nio.file.Paths;
11 import java.util.ArrayList;
12 import java.util.Collections;
13 import java.util.Comparator;
14 import java.util.Enumeration;
15 import java.util.Iterator;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.ResourceBundle;
19 import java.util.TreeMap;
20 import java.util.UUID;
21
22 import org.apache.solr.common.SolrDocument;
23 import org.apache.solr.common.SolrInputDocument;
24 import org.bson.types.ObjectId;
25
26 import com.acumenvelocity.ath.common.exception.AthRuntimeException;
27 import com.acumenvelocity.ath.filters.pdf.AthPdfFilter;
28 import com.acumenvelocity.ath.filters.pdf.Parameters;
29 import com.acumenvelocity.ath.gcs.AthStorage;
30 import com.acumenvelocity.ath.model.EncodingInfo;
31 import com.acumenvelocity.ath.model.FilterInfo;
32 import com.acumenvelocity.ath.model.LanguageInfo;
33 import com.acumenvelocity.ath.model.MtResources;
34 import com.acumenvelocity.ath.model.OcrMode;
35 import com.acumenvelocity.ath.model.ParametersFormat;
36 import com.acumenvelocity.ath.solr.doc.SolrDocMergerStep;
37 import com.acumenvelocity.ath.solr.doc.SolrDocWriterStep;
38 import com.acumenvelocity.ath.solr.tm.SolrTmConnector;
39 import com.acumenvelocity.ath.srx.SrxFileMapper;
40 import com.acumenvelocity.ath.steps.BatchMtParameters;
41 import com.acumenvelocity.ath.steps.BatchMtStep;
42 import com.acumenvelocity.ath.steps.CodesReinsertionStep;
43 import com.acumenvelocity.ath.steps.MtConfidenceScoringStep;
44 import com.acumenvelocity.ath.steps.MtLeveragingStep;
45 import com.acumenvelocity.ath.steps.SegmentTrimmerStep;
46 import com.fasterxml.jackson.databind.JsonNode;
47 import com.google.auth.oauth2.GoogleCredentials;
48 import com.google.auth.oauth2.ServiceAccountCredentials;
49
50 import io.swagger.oas.inflector.models.ResponseContext;
51 import net.sf.okapi.applications.rainbow.MainForm;
52 import net.sf.okapi.applications.rainbow.lib.EncodingItem;
53 import net.sf.okapi.applications.rainbow.lib.EncodingManager;
54 import net.sf.okapi.applications.rainbow.lib.LanguageItem;
55 import net.sf.okapi.applications.rainbow.lib.LanguageManager;
56 import net.sf.okapi.common.BOMNewlineEncodingDetector;
57 import net.sf.okapi.common.IParametersEditorMapper;
58 import net.sf.okapi.common.ListUtil;
59 import net.sf.okapi.common.LocaleId;
60 import net.sf.okapi.common.MimeTypeMapper;
61 import net.sf.okapi.common.StringUtil;
62 import net.sf.okapi.common.Util;
63 import net.sf.okapi.common.filters.FilterConfiguration;
64 import net.sf.okapi.common.filters.IFilter;
65 import net.sf.okapi.common.filters.IFilterConfigurationMapper;
66 import net.sf.okapi.common.filters.SharedFilterConfigurationMapper;
67 import net.sf.okapi.common.pipeline.IPipelineStep;
68 import net.sf.okapi.common.pipeline.PipelineReturnValue;
69 import net.sf.okapi.common.pipelinebuilder.XBatch;
70 import net.sf.okapi.common.pipelinebuilder.XBatchItem;
71 import net.sf.okapi.common.pipelinebuilder.XDocument;
72 import net.sf.okapi.common.pipelinebuilder.XParameter;
73 import net.sf.okapi.common.pipelinebuilder.XPipeline;
74 import net.sf.okapi.common.pipelinebuilder.XPipelineStep;
75 import net.sf.okapi.connectors.google.GoogleMTv2Connector;
76 import net.sf.okapi.connectors.google.GoogleMTv2Parameters;
77 import net.sf.okapi.steps.common.FilterEventsToRawDocumentStep;
78 import net.sf.okapi.steps.common.RawDocumentToFilterEventsStep;
79 import net.sf.okapi.steps.desegmentation.DesegmentationStep;
80 import net.sf.okapi.steps.heuristicaligner.HeuristicSentenceAlignerStep;
81 import net.sf.okapi.steps.leveraging.LeveragingStep;
82 import net.sf.okapi.steps.llmsentencealigner.LlmSentenceAlignerStep;
83 import net.sf.okapi.steps.segmentation.SegmentationStep;
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108 public class ControllerUtil {
// Name of the resource bundle whose "filterClass_*", "parametersClass_*",
// "parametersEditorClass_*" and "editorDescriptionProvider_*" keys are read by setMappings().
private static final String BUNDLE_NAME = "net.sf.okapi.common.filters.DefaultFilters";
// Project id extracted from the GCP key file; assigned once in the static initializer below.
private static String projectId;

// Lookup lists built once at class-load time (see the static initializer) and exposed
// through the corresponding getters.
private static final List<LanguageInfo> languageInfos;
private static final List<EncodingInfo> encodingInfos;
private static final List<FilterInfo> filterInfos;

// Filter configuration mapper used by this class to create filters.
private static IFilterConfigurationMapper fcMapper = new SharedFilterConfigurationMapper();
// File extension -> filter configuration id; filled by createFilterInfos().
private static Map<String, String> extensionsMap = new TreeMap<>();

static {
  // The field initializers above run first (textual order), so fcMapper and extensionsMap
  // already exist when the factory methods below use them.

  // Register the PDF filter's configurations before the default filters are added.
  fcMapper.addConfigurations(AthPdfFilter.class.getName());

  languageInfos = createLanguageInfos();
  encodingInfos = createEncodingInfos();
  // Also registers the default filter configurations and fills extensionsMap.
  filterInfos = createFilterInfos();

  // May be null when the key file cannot be read or carries no project id.
  projectId = getProjectIdFromKeyFile();
}
131
/**
 * Returns the language list built once during class initialization.
 *
 * @return the list produced by {@code createLanguageInfos()}, which wraps it in
 *         {@code Collections.unmodifiableList}
 */
public static List<LanguageInfo> getLanguageInfos() {
  return languageInfos;
}
135
/**
 * Returns the encoding list built once during class initialization.
 *
 * @return the list produced by {@code createEncodingInfos()}, which wraps it in
 *         {@code Collections.unmodifiableList}
 */
public static List<EncodingInfo> getEncodingInfos() {
  return encodingInfos;
}
139
/**
 * Returns the filter descriptions built once during class initialization.
 *
 * @return the list produced by {@code createFilterInfos()}, sorted by name and wrapped in
 *         {@code Collections.unmodifiableList}
 */
public static List<FilterInfo> getFilterInfos() {
  return filterInfos;
}
143
/**
 * Returns the class-wide filter configuration mapper.
 *
 * @return the single static mapper instance; callers receive the same object this class
 *         uses to create filters
 */
public static IFilterConfigurationMapper getFcMapper() {
  return fcMapper;
}
147
148 public static boolean checkObjectIdParam(String param) {
149 return checkParam(param) && ObjectId.isValid(param);
150 }
151
152
153
154
155
156
157
158
159
160
161 public static void setMappings(IFilterConfigurationMapper fcMapper, boolean reset,
162 boolean addConfigurations) {
163
164
165 ResourceBundle res = ResourceBundle.getBundle(BUNDLE_NAME);
166 Enumeration<String> keys = res.getKeys();
167 ArrayList<String> list = Collections.list(keys);
168
169 if (reset) {
170 fcMapper.clearConfigurations(false);
171 ((IParametersEditorMapper) fcMapper).clearDescriptionProviders();
172 ((IParametersEditorMapper) fcMapper).clearEditors();
173 }
174
175
176 for (String key : list) {
177
178 if (!key.startsWith("filterClass"))
179 continue;
180
181 try {
182 int n = key.indexOf('_');
183 String suffix = key.substring(n);
184 String value = res.getString(key);
185
186
187 if (addConfigurations) {
188 fcMapper.addConfigurations(value);
189 }
190
191 String key2 = "parametersClass" + suffix;
192
193 if (list.contains(key2)) {
194 String paramsClass = res.getString(key2);
195
196
197 String key3 = "parametersEditorClass" + suffix;
198
199 if (list.contains(key3)) {
200 value = res.getString(key3);
201 ((IParametersEditorMapper) fcMapper).addEditor(value, paramsClass);
202
203 } else {
204 key3 = "editorDescriptionProvider" + suffix;
205
206 if (list.contains(key3)) {
207 value = res.getString(key3);
208 ((IParametersEditorMapper) fcMapper).addDescriptionProvider(value, paramsClass);
209 }
210 }
211 }
212
213 } catch (Exception ex) {
214 Log.warn(ControllerUtil.class,
215 "Error while trying to build filter for the property key '{}' -- {}", key,
216 ex.getMessage());
217
218 continue;
219 }
220 }
221 }
222
223 private static List<FilterInfo> createFilterInfos() {
224 List<FilterInfo> filterInfos = new ArrayList<>();
225 extensionsMap.clear();
226
227 setMappings(fcMapper, false, true);
228 Iterator<FilterConfiguration> iter = fcMapper.getAllConfigurations();
229 StringBuilder sb = new StringBuilder();
230
231 while (iter.hasNext()) {
232 FilterConfiguration fc = iter.next();
233
234 try {
235 IFilter filter = fcMapper.createFilter(fc.configId);
236 sb.append(Log.format("{} -- {}\n", fc.configId, fc.name));
237
238 String paramsSt = filter == null || filter.getParameters() == null ? ""
239 : filter.getParameters().toString();
240
241 FilterInfo fi = new FilterInfo()
242 .id(fc.configId)
243 .name(fc.name)
244 .description(fc.description)
245 .custom(fc.custom)
246 .mimeType(fc.mimeType)
247 .fileExtensions(ListUtil.stringAsList(fc.extensions, ";"))
248 .parameters(paramsSt)
249 .parametersFormat(getFormat(paramsSt));
250
251 for (String ext : fi.getFileExtensions()) {
252 extensionsMap.put(ext, fi.getId());
253 }
254
255 filterInfos.add(fi);
256
257 } catch (Exception e) {
258 Log.warn(ControllerUtil.class, "Cannot instantiate the filter '{}' -- {}", fc.filterClass,
259 e.getMessage());
260
261 continue;
262 }
263 }
264
265 Collections.sort(filterInfos,
266 Comparator.comparing(FilterInfo::getName, String.CASE_INSENSITIVE_ORDER));
267
268 Log.debug(ControllerUtil.class, "\n\n------ Filter configs:\n{}\n\n", sb.toString());
269 return Collections.unmodifiableList(filterInfos);
270
271 }
272
273 private static ParametersFormat getFormat(String st) {
274 if (st.startsWith("#v")) {
275 return ParametersFormat.FPRM;
276
277 } else if (st.startsWith("<?xml")) {
278 return ParametersFormat.XML;
279
280 } else {
281 return ParametersFormat.YAML;
282 }
283 }
284
285 private static List<EncodingInfo> createEncodingInfos() {
286 List<EncodingInfo> encodings = new ArrayList<>();
287 EncodingManager em = new EncodingManager();
288
289 try (InputStream is = MainForm.class.getResourceAsStream("/shared/encodings.xml")) {
290 em.loadList(is);
291 EncodingItem ei;
292
293 for (int i = 0; i < em.getCount(); i++) {
294 ei = em.getItem(i);
295
296 EncodingInfo item = new EncodingInfo()
297 .name(ei.name)
298 .codePage(ei.codePage)
299 .ianaName(ei.ianaName);
300
301 encodings.add(item);
302 }
303
304 } catch (IOException e) {
305 Log.warn(ControllerUtil.class, "Error getting encodings -- {}", e.getMessage());
306 }
307
308 return Collections.unmodifiableList(encodings);
309 }
310
311 private static List<LanguageInfo> createLanguageInfos() {
312 List<LanguageInfo> langInfos = new ArrayList<>();
313
314 LanguageManager lm = new LanguageManager();
315 LanguageItem li;
316
317 for (int i = 0; i < lm.getCount(); i++) {
318 li = lm.getItem(i);
319
320 LanguageInfo item = new LanguageInfo();
321
322 item.setName(li.name);
323 item.setIsoCode(li.code);
324
325 langInfos.add(item);
326 }
327
328 return Collections.unmodifiableList(langInfos);
329 }
330
331 public static boolean checkParam(String param) {
332
333
334 return !Util.isEmpty(param) && !"null".equalsIgnoreCase(param);
335 }
336
/**
 * Tells whether a non-string request parameter is present.
 *
 * <p>Arguments whose static type is {@code String} resolve to the {@code checkParam(String)}
 * overload instead of this one.
 *
 * @param param the parameter value, may be {@code null}
 * @return {@code true} when {@code param} is not {@code null}
 */
public static boolean checkParam(Object param) {
  return param != null;
}
340
341
342
343
344
345
346
347
348
349
350 public static <T> List<T> getPage(List<T> sourceList, int page, int pageSize) {
351 if (pageSize <= 0 || page <= 0) {
352 throw new IllegalArgumentException("Invalid page size: " + pageSize);
353 }
354
355 int fromIndex = (page - 1) * pageSize;
356
357 if (sourceList == null || sourceList.size() <= fromIndex) {
358 return Collections.emptyList();
359 }
360
361
362 return sourceList.subList(fromIndex, Math.min(fromIndex + pageSize, sourceList.size()));
363 }
364
365 public static String getFilterId(String fileExt) {
366 return extensionsMap.get(fileExt);
367 }
368
369 public static boolean checkUuidParam(String param) {
370 return checkParam(param) && isValidUuid(param);
371 }
372
373 private static boolean isValidUuid(String uuid) {
374
375 String uuidPattern = "^[0-9a-fA-F]{8}-?[0-9a-fA-F]{4}-?[0-9a-fA-F]{4}-?[0-9a-fA-F]{4}-?"
376 + "[0-9a-fA-F]{12}$";
377
378 return uuid.matches(uuidPattern);
379 }
380
381 public static ResponseContext importFile(
382 SolrInputDocument doc,
383 UUID docId,
384 String docFileName,
385 URI docGcsUrl,
386 String docEncoding,
387 String srcLang,
388 String trgLang,
389 String filterId,
390 String filterParams,
391 String srcSrx,
392 UUID tmId,
393 Integer tmThreshold,
394 String mtEngineId,
395 String mtEngineParams,
396 List<MtResources> mtCustomResources,
397 boolean mtProvideConfidenceScores,
398 boolean mtUseTranslateLlm,
399 boolean mtSendPlainText,
400 boolean useCodesReinsertionModel,
401 String codesReinsertionModelName,
402 UUID userId,
403 boolean newDoc) throws Exception {
404
405 IFilterConfigurationMapper fcMapper = ControllerUtil.getFcMapper();
406
407 if (Util.isEmpty(filterId)) {
408
409 String fileExt = Util.getExtension(docFileName);
410
411 if (Util.isEmpty(fileExt)) {
412 throw new AthRuntimeException("No file extension or filterId specified");
413
414 } else {
415 filterId = ControllerUtil.getFilterId(fileExt);
416
417 if (Util.isEmpty(filterId)) {
418 throw new AthRuntimeException("Unknown file extension");
419 }
420 }
421 }
422
423 try (IFilter filter = fcMapper.createFilter(filterId)) {
424 if (!Util.isEmpty(filterParams)) {
425
426 filter.getParameters().fromString(filterParams);
427 }
428
429 doc.setField(Const.ATH_PROP_FILTER_ID, filterId);
430
431 if (filterParams == null && filter.getParameters() != null) {
432 filterParams = filter.getParameters().toString();
433 }
434
435 if (filterParams != null) {
436 doc.setField(Const.ATH_PROP_FILTER_PARAMS, filterParams);
437 }
438
439 Log.info(ControllerUtil.class, "{} (id='{}') was created", filter.getName(), filterId);
440
441 if (Util.isEmpty(srcSrx)) {
442 srcSrx = getSrx(srcLang);
443 }
444
445 doc.setField(Const.ATH_PROP_SRC_SRX, srcSrx);
446
447 LeveragingStep ls = tmId == null ? null : new LeveragingStep();
448
449 if (ls != null) {
450 net.sf.okapi.steps.leveraging.Parameters lsParams = ls.getParameters();
451 lsParams.setResourceClassName(SolrTmConnector.class.getName());
452
453 com.acumenvelocity.ath.solr.tm.Parameters connParams = new com.acumenvelocity.ath.solr.tm.Parameters();
454 connParams.setTmId(tmId.toString());
455
456 lsParams.setResourceParameters(connParams.toString());
457 lsParams.setThreshold(tmThreshold);
458 lsParams.setFillTarget(true);
459 lsParams.setFillTargetThreshold(tmThreshold);
460 }
461
462 boolean useMt = !Util.isEmpty(mtEngineId);
463 IPipelineStep mts = null;
464
465 if (useMt) {
466 switch (mtEngineId) {
467 case Const.MT_PROVIDER_GOOGLE_MT:
468 MtLeveragingStep v2Mts = new MtLeveragingStep(mtSendPlainText);
469 mts = v2Mts;
470
471 net.sf.okapi.steps.leveraging.Parameters lsParams = v2Mts.getParameters();
472 lsParams.setResourceClassName(GoogleMTv2Connector.class.getName());
473
474 if (Util.isEmpty(mtEngineParams)) {
475 GoogleMTv2Parameters connParams = new GoogleMTv2Parameters();
476 connParams.setApiKey(Const.ATH_GCT_API_KEY);
477 connParams.setRetryIntervalMs(0);
478 connParams.setRetryCount(1);
479
480 lsParams.setResourceParameters(connParams.toString());
481
482 } else {
483 lsParams.setResourceParameters(mtEngineParams);
484 }
485
486 break;
487
488 case Const.MT_PROVIDER_GOOGLE_MT_V3:
489 BatchMtStep bmts = new BatchMtStep();
490 mts = bmts;
491
492 BatchMtParameters bmtsParams = bmts.getParameters();
493
494 bmtsParams.setApiKey(Const.ATH_GCT_API_KEY);
495 bmtsParams.setProjectLocation(Const.ATH_GCP_PROJECT_LOCATION);
496 bmtsParams.setCredentialsPath(Const.ATH_GCP_SECRET_FILE);
497
498 boolean isCustomModel = mtCustomResources != null && mtCustomResources.size() > 0;
499
500 if (isCustomModel) {
501 MtResources res = mtCustomResources.get(0);
502
503 bmtsParams.setGlossaryProjectId(res.getMtGlossaryProjectId());
504 bmtsParams.setGlossaryProjectLocation(res.getMtGlossaryProjectLocation());
505 bmtsParams.setGlossaryId(res.getMtGlossaryId());
506 bmtsParams.setModelProjectId(res.getMtModelProjectId());
507 bmtsParams.setModelProjectLocation(res.getMtModelProjectLocation());
508 bmtsParams.setModelId(res.getMtModelId());
509 }
510
511 bmtsParams.setMimeType(mtSendPlainText ? MimeTypeMapper.PLAIN_TEXT_MIME_TYPE
512 : MimeTypeMapper.HTML_MIME_TYPE);
513
514 bmtsParams.setRetryIntervalMs(0);
515 bmtsParams.setRetryCount(1);
516 bmtsParams.setFailuresBeforeAbort(-1);
517
518 bmtsParams.setMtUseTranslateLlm(mtUseTranslateLlm);
519 bmtsParams.setMtSendPlainText(mtSendPlainText);
520 break;
521
522 default:
523 break;
524 }
525 }
526
527
528
529
530
531
532
533
534
535 if (filter instanceof AthPdfFilter) {
536 Parameters params = (Parameters) filter.getParameters();
537
538 if (params != null && params.getOcrMode() == OcrMode.AUTO) {
539 File docFile = AthUtil.createTempFile();
540
541 try {
542 AthStorage.storeFile(docGcsUrl, MimeTypeMapper.PDF_MIME_TYPE, docFile);
543 boolean needsOcr = PdfUtil.needsOcr(docFile);
544
545 if (needsOcr) {
546 params.setOcrMode(OcrMode.ENABLED);
547
548 } else {
549 params.setOcrMode(OcrMode.DISABLED);
550 }
551
552 } catch (Exception e) {
553 Log.error(ControllerUtil.class, e, "PDF conversion error");
554
555 } finally {
556 try {
557 Files.deleteIfExists(docFile.toPath());
558
559 } catch (Exception ignored) {
560 }
561 }
562 }
563 }
564
565
566 try (InputStream is = AthStorage.getInputStream(docGcsUrl);
567
568 XPipeline pl = new XPipeline(
569 "Import pipeline",
570
571 new XBatch(
572 new XBatchItem(
573 new XDocument(
574 is,
575 docEncoding,
576 LocaleId.fromString(srcLang),
577 LocaleId.fromString(trgLang)))),
578
579 new RawDocumentToFilterEventsStep(filter),
580
581 new XPipelineStep(
582 new SegmentationStep(),
583 new XParameter("sourceSrx", srcSrx),
584 new XParameter("targetSrx", null),
585 new XParameter("segmentSource", !Util.isEmpty(srcSrx)),
586 new XParameter("segmentTarget", false),
587 new XParameter("copySource", false)),
588
589 new SegmentTrimmerStep(),
590 ls,
591
592 mtProvideConfidenceScores
593 ? new MtConfidenceScoringStep(mtCustomResources, mtSendPlainText)
594 : mts,
595
596 mtSendPlainText ?
597
598
599 new CodesReinsertionStep(useCodesReinsertionModel, codesReinsertionModelName)
600 : null,
601
602 new SolrDocWriterStep(docId, docFileName, userId, newDoc))) {
603
604 PipelineReturnValue res = pl.execute();
605
606 if (res == PipelineReturnValue.SUCCEDED) {
607 return Response.success(200);
608 }
609 }
610
611 return Response.error(500, "Import failed");
612
613 } catch (Exception e) {
614 String st = newDoc ? "Import failed" : "Update failed";
615 Log.error(ControllerUtil.class, e, st);
616 return Response.error(500, e, st);
617 }
618 }
619
/**
 * Aligns a source document with its translation and writes the result through a
 * {@code SolrDocWriterStep}.
 *
 * <p>The filter id, effective filter parameters and SRX rules are recorded on {@code doc}.
 * The alignment pipeline itself only runs when {@code docTrlGcsUrl} is given; without it the
 * method ends with the generic 500 "Alignment failed" response.
 *
 * @param doc the Solr input document that receives the filter id, filter parameters and SRX
 *        fields
 * @param docId id passed to the {@code SolrDocWriterStep}
 * @param docFile local copy of the source document; only used to resolve the PDF filter's
 *        {@code AUTO} OCR mode, may be {@code null}
 * @param docFileName file name; its extension selects the filter when {@code filterId} is
 *        empty
 * @param docGcsUrl location the source document is read from
 * @param docEncoding encoding of the source document
 * @param docTrlGcsUrl location of the translated document; {@code null} means there is no
 *        translation to align with
 * @param docTrlEncoding encoding of the translated document
 * @param srcLang source language code
 * @param trgLang target language code
 * @param filterId filter configuration id, may be empty
 * @param filterParams serialized filter parameters, may be empty
 * @param srcSrx source segmentation rules; when empty, loaded via {@code getSrx(srcLang)}
 * @param trgSrx target segmentation rules; when empty and a translation is given, loaded via
 *        {@code getSrx(trgLang)}
 * @param mtSendPlainText not used by the code in this method
 * @param useAlignmentModel {@code true} selects the {@code LlmSentenceAlignerStep},
 *        {@code false} the {@code HeuristicSentenceAlignerStep}
 * @param alignmentModelName passed as the {@code llmModel} parameter of the LLM aligner
 * @param useCodesReinsertionModel forwarded to the LLM aligner and the
 *        {@code CodesReinsertionStep}
 * @param codesReinsertionModelName forwarded to the {@code CodesReinsertionStep}
 * @param userId id passed to the {@code SolrDocWriterStep}
 * @return a 200 response when the pipeline succeeds, otherwise a 500 response
 * @throws AthRuntimeException when no filter can be determined from {@code filterId} or the
 *         file extension
 * @throws Exception declared for callers; failures after the filter id is resolved are
 *         logged and reported as a 500 response instead
 */
public static ResponseContext alignFile(
    SolrInputDocument doc,
    UUID docId,
    File docFile,
    String docFileName,
    URI docGcsUrl,
    String docEncoding,
    URI docTrlGcsUrl,
    String docTrlEncoding,
    String srcLang,
    String trgLang,
    String filterId,
    String filterParams,
    String srcSrx,
    String trgSrx,
    boolean mtSendPlainText,
    boolean useAlignmentModel,
    String alignmentModelName,
    boolean useCodesReinsertionModel,
    String codesReinsertionModelName,
    UUID userId) throws Exception {

  boolean alignWithTranslation = docTrlGcsUrl != null;
  IFilterConfigurationMapper fcMapper = ControllerUtil.getFcMapper();

  if (Util.isEmpty(filterId)) {
    // No explicit filter: fall back to the one registered for the file extension.
    String fileExt = Util.getExtension(docFileName);

    if (Util.isEmpty(fileExt)) {
      throw new AthRuntimeException("No file extension or filterId specified");

    } else {
      filterId = ControllerUtil.getFilterId(fileExt);

      if (Util.isEmpty(filterId)) {
        throw new AthRuntimeException("Unknown file extension");
      }
    }
  }

  try (IFilter filter = fcMapper.createFilter(filterId)) {
    if (!Util.isEmpty(filterParams)) {
      // NOTE(review): getParameters() is not null-checked here, unlike a few lines below.
      filter.getParameters().fromString(filterParams);
    }

    doc.setField(Const.ATH_PROP_FILTER_ID, filterId);

    // Record the filter's default parameters when the caller supplied none.
    if (filterParams == null && filter.getParameters() != null) {
      filterParams = filter.getParameters().toString();
    }

    if (filterParams != null) {
      doc.setField(Const.ATH_PROP_FILTER_PARAMS, filterParams);
    }

    Log.info(ControllerUtil.class, "{} (id='{}') was created", filter.getName(), filterId);

    if (Util.isEmpty(srcSrx)) {
      srcSrx = getSrx(srcLang);
    }

    doc.setField(Const.ATH_PROP_SRC_SRX, srcSrx);

    if (Util.isEmpty(trgSrx)) {
      if (alignWithTranslation) {
        trgSrx = getSrx(trgLang);

      } else {
        // No translation document: trgSrx is intentionally left empty.
      }
    }

    if (!Util.isEmpty(trgSrx)) {
      doc.setField(Const.ATH_PROP_TRG_SRX, trgSrx);
    }

    // Resolve the PDF filter's AUTO OCR mode from the local copy of the document, when the
    // caller provided one.
    if (filter instanceof AthPdfFilter && docFile != null) {
      Parameters params = (Parameters) filter.getParameters();

      if (params != null && params.getOcrMode() == OcrMode.AUTO) {
        boolean needsOcr = PdfUtil.needsOcr(docFile);

        if (needsOcr) {
          params.setOcrMode(OcrMode.ENABLED);

        } else {
          params.setOcrMode(OcrMode.DISABLED);
        }
      }
    }

    if (alignWithTranslation) {
      // The aligner steps take the SRX rules as file paths, so the rule texts are written to
      // temporary files that are removed again in the finally block.
      File srcSrxFile = AthUtil.createTempFile();
      File trgSrxFile = AthUtil.createTempFile();

      try {

        StringUtil.writeString(srcSrx, srcSrxFile);
        StringUtil.writeString(trgSrx, trgSrxFile);

        try (InputStream sis = AthStorage.getInputStream(docGcsUrl);
            InputStream tis = AthStorage.getInputStream(docTrlGcsUrl);
            // NOTE(review): the target filter is created with default parameters;
            // filterParams is only applied to the source filter above -- confirm intended.
            IFilter targetFilter = fcMapper.createFilter(filterId);

            XPipeline pl = new XPipeline(
                "Alignment pipeline",

                new XBatch(
                    new XBatchItem(
                        new XDocument(
                            sis,
                            docEncoding,
                            LocaleId.fromString(srcLang),
                            LocaleId.fromString(trgLang)),

                        // The translated document uses trgLang as both of its locales.
                        new XDocument(
                            tis,
                            docTrlEncoding,
                            LocaleId.fromString(trgLang),
                            LocaleId.fromString(trgLang)))),

                new RawDocumentToFilterEventsStep(filter),

                // Heuristic aligner unless the alignment model was requested.
                !useAlignmentModel
                    ? new XPipelineStep(
                        new HeuristicSentenceAlignerStep(targetFilter),
                        new XParameter("generateTMX", false),
                        new XParameter("tmxOutputPath", null),
                        new XParameter("collapseWhitespace", false),
                        new XParameter("outputOneTOneMatchesOnly", false),
                        new XParameter("forceSimpleOneToOneAlignment", false),
                        new XParameter("segmentSource", true),
                        new XParameter("useCustomSourceRules", true),
                        new XParameter("customSourceRulesPath", srcSrxFile.getAbsolutePath()),
                        new XParameter("segmentTarget", true),
                        new XParameter("useCustomTargetRules", true),
                        new XParameter("customTargetRulesPath", trgSrxFile.getAbsolutePath()))

                    : new XPipelineStep(
                        new LlmSentenceAlignerStep(targetFilter),
                        new XParameter("useLlmAlignment", true),
                        new XParameter("llmModel", alignmentModelName),
                        new XParameter("useCodesReinsertionModel", useCodesReinsertionModel),
                        new XParameter("maxParagraphsPerRequest", 1000),
                        new XParameter("logAlignmentDetails", true),
                        new XParameter("collapseWhitespace", true),

                        new XParameter("segmentSource", !Util.isEmpty(srcSrx)),
                        new XParameter("useCustomSourceRules", !Util.isEmpty(srcSrx)),
                        new XParameter("customSourceRulesPath", srcSrxFile.getAbsolutePath()),

                        new XParameter("segmentTarget", !Util.isEmpty(trgSrx)),
                        new XParameter("useCustomTargetRules", !Util.isEmpty(trgSrx)),
                        new XParameter("customTargetRulesPath", trgSrxFile.getAbsolutePath())),

                new CodesReinsertionStep(useCodesReinsertionModel, codesReinsertionModelName),
                new SolrDocWriterStep(docId, docFileName, userId, false))) {

          PipelineReturnValue res = pl.execute();

          if (res == PipelineReturnValue.SUCCEDED) {
            return Response.success(200);
          }
        }

      } finally {
        // Remove the temporary SRX files whether or not the pipeline succeeded.
        if (srcSrxFile != null) {
          srcSrxFile.delete();
        }

        if (trgSrxFile != null) {
          trgSrxFile.delete();
        }
      }
    }

    // Reached when the pipeline did not succeed or when no translation was given.
    return Response.error(500, "Alignment failed");

  } catch (Exception e) {
    String st = "Alignment failed";
    Log.error(ControllerUtil.class, e, st);
    return Response.error(500, e, st);
  }
}
818
819
820
821
822
823
824
825 private static String getSrx(String lang) {
826 String fn = SrxFileMapper.getSrxFileName(lang);
827
828 String srxFileName = (fn == null)
829 ? "/srx/alternate-default.srx"
830
831 : "/srx/memoq/default/v9.0/" + fn;
832
833 try (InputStream in = ControllerUtil.class.getResourceAsStream(srxFileName)) {
834 if (in == null) {
835 throw new IllegalArgumentException("SRX file not found: " + srxFileName);
836 }
837
838
839 BOMNewlineEncodingDetector detector = new BOMNewlineEncodingDetector(in,
840 StandardCharsets.UTF_8);
841
842 detector.detectAndRemoveBom();
843
844
845 byte[] data = detector.getInputStream().readAllBytes();
846 return new String(data, StandardCharsets.UTF_8);
847
848 } catch (Exception e) {
849 throw new RuntimeException("Failed to read SRX file: " + srxFileName, e);
850 }
851 }
852
853 public static ResponseContext exportFile(
854 SolrDocument doc,
855 SolrDocument tmDoc,
856 SolrInputDocument updateDoc,
857 URI docOutGcsUrl,
858 String docOutEncoding,
859 UUID tmId,
860 UUID userId) throws Exception {
861
862 String catFrameworkName = SolrUtil.safeGetField(doc, Const.ATH_PROP_CAT_FRAMEWORK_NAME,
863 null);
864
865 String catFrameworkVersion = SolrUtil.safeGetField(doc, Const.ATH_PROP_CAT_FRAMEWORK_VERSION,
866 null);
867
868
869
870
871
872
873
874
875
876
877
878
879
880 if (catFrameworkName != null && !catFrameworkName.equals(Const.CAT_FRAMEWORK_NAME)) {
881 throw new AthRuntimeException("CAT framework mismatch -- expected: {}, actual: {}",
882 catFrameworkName, Const.CAT_FRAMEWORK_NAME);
883 }
884
885 if (catFrameworkVersion != null && !catFrameworkVersion.equals(Const.CAT_FRAMEWORK_VERSION)) {
886 Log.warn(ControllerUtil.class,
887 "Export success is not guaranteed because of the {} version mismatch -- expected: {}, "
888 + "actual: {}",
889 Const.CAT_FRAMEWORK_NAME, catFrameworkVersion, Const.CAT_FRAMEWORK_VERSION);
890 }
891
892 IFilterConfigurationMapper fcMapper = ControllerUtil.getFcMapper();
893
894 String docId = SolrUtil.safeGetField(doc, Const.ATH_PROP_DOC_ID, null);
895 String filterId = SolrUtil.safeGetField(doc, Const.ATH_PROP_FILTER_ID, null);
896 String filterParams = SolrUtil.safeGetField(doc, Const.ATH_PROP_FILTER_PARAMS, null);
897 String docStorageName = SolrUtil.safeGetField(doc, Const.ATH_PROP_DOC_STORAGE_NAME, null);
898
899 String docFileEncoding = SolrUtil.safeGetField(doc, Const.ATH_PROP_DOC_FILE_ENCODING,
900 null);
901
902 String srcLang = SolrUtil.safeGetField(doc, Const.ATH_PROP_SRC_LANG, null);
903 String trgLang = SolrUtil.safeGetField(doc, Const.ATH_PROP_TRG_LANG, null);
904
905
906
907 IFilter filter = fcMapper.createFilter(filterId);
908
909
910 filter.getParameters().fromString(filterParams);
911
912 Log.info(ControllerUtil.class, "{} (id='{}') was created", filter.getName(), filterId);
913
914 File outFile = null;
915
916 try {
917 outFile = AthUtil.createTempFile();
918
919 try (InputStream is = AthStorage.getInputStream(AthUtil.toURI(docStorageName));
920 XPipeline pl = new XPipeline(
921 "Export pipeline",
922
923 new XBatch(
924 new XBatchItem(
925 new XDocument(
926 is,
927 docFileEncoding,
928 outFile.getAbsolutePath(),
929 docOutEncoding,
930 LocaleId.fromString(srcLang),
931 LocaleId.fromString(trgLang)))),
932
933 new RawDocumentToFilterEventsStep(filter),
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949 new SolrDocMergerStep(docId),
950 new DesegmentationStep(),
951 new FilterEventsToRawDocumentStep())) {
952
953 PipelineReturnValue res = pl.execute();
954
955 if (res == PipelineReturnValue.SUCCEDED) {
956
957 AthStorage.storeFile(docOutGcsUrl, filter.getMimeType(), outFile);
958
959 return Response.success(200);
960 }
961 }
962
963 return Response.error(500, "Export failed");
964
965 } catch (Exception e) {
966 Log.error(ControllerUtil.class, e, "Export failed");
967 return Response.error(500, e, "Export failed");
968 }
969 }
970
/**
 * Returns the project id determined during class initialization.
 *
 * @return the value produced by {@code getProjectIdFromKeyFile()}, which may be {@code null}
 */
public static String getProjectId() {
  return projectId;
}
974
975
976
977
978
979 private static String getProjectIdFromKeyFile() {
980 try (FileInputStream fis = new FileInputStream(Const.ATH_GCP_SECRET_FILE)) {
981 GoogleCredentials credentials = GoogleCredentials.fromStream(fis);
982
983 if (credentials instanceof ServiceAccountCredentials) {
984 ServiceAccountCredentials sac = (ServiceAccountCredentials) credentials;
985 String projectId = sac.getProjectId();
986
987 if (projectId != null && !projectId.isBlank()) {
988 return projectId;
989 }
990 }
991
992 } catch (Exception e) {
993
994
995 Log.warn(ConversionUtil.class,
996 "Official credential parsing failed ({}), falling back to JSON parsing",
997 e.getMessage());
998 }
999
1000
1001 try {
1002 String json = Files.readString(Paths.get(Const.ATH_GCP_SECRET_FILE));
1003 JsonNode jsonNode = JacksonUtil.makeNode(json);
1004 return jsonNode.get(json) == null ? null : jsonNode.get(json).asText();
1005
1006 } catch (Exception ex) {
1007 Log.error(ConversionUtil.class, ex, "Failed to extract project_id from key file");
1008 }
1009
1010 return null;
1011 }
1012 }