View Javadoc
1   package com.acumenvelocity.ath.common;
2   
3   import java.io.File;
4   import java.io.FileInputStream;
5   import java.io.IOException;
6   import java.io.InputStream;
7   import java.net.URI;
8   import java.nio.charset.StandardCharsets;
9   import java.nio.file.Files;
10  import java.nio.file.Paths;
11  import java.util.ArrayList;
12  import java.util.Collections;
13  import java.util.Comparator;
14  import java.util.Enumeration;
15  import java.util.Iterator;
16  import java.util.List;
17  import java.util.Map;
18  import java.util.ResourceBundle;
19  import java.util.TreeMap;
20  import java.util.UUID;
21  
22  import org.apache.solr.common.SolrDocument;
23  import org.apache.solr.common.SolrInputDocument;
24  import org.bson.types.ObjectId;
25  
26  import com.acumenvelocity.ath.common.exception.AthRuntimeException;
27  import com.acumenvelocity.ath.filters.pdf.AthPdfFilter;
28  import com.acumenvelocity.ath.filters.pdf.Parameters;
29  import com.acumenvelocity.ath.gcs.AthStorage;
30  import com.acumenvelocity.ath.model.EncodingInfo;
31  import com.acumenvelocity.ath.model.FilterInfo;
32  import com.acumenvelocity.ath.model.LanguageInfo;
33  import com.acumenvelocity.ath.model.MtResources;
34  import com.acumenvelocity.ath.model.OcrMode;
35  import com.acumenvelocity.ath.model.ParametersFormat;
36  import com.acumenvelocity.ath.solr.doc.SolrDocMergerStep;
37  import com.acumenvelocity.ath.solr.doc.SolrDocWriterStep;
38  import com.acumenvelocity.ath.solr.tm.SolrTmConnector;
39  import com.acumenvelocity.ath.srx.SrxFileMapper;
40  import com.acumenvelocity.ath.steps.BatchMtParameters;
41  import com.acumenvelocity.ath.steps.BatchMtStep;
42  import com.acumenvelocity.ath.steps.CodesReinsertionStep;
43  import com.acumenvelocity.ath.steps.MtConfidenceScoringStep;
44  import com.acumenvelocity.ath.steps.MtLeveragingStep;
45  import com.acumenvelocity.ath.steps.SegmentTrimmerStep;
46  import com.fasterxml.jackson.databind.JsonNode;
47  import com.google.auth.oauth2.GoogleCredentials;
48  import com.google.auth.oauth2.ServiceAccountCredentials;
49  
50  import io.swagger.oas.inflector.models.ResponseContext;
51  import net.sf.okapi.applications.rainbow.MainForm;
52  import net.sf.okapi.applications.rainbow.lib.EncodingItem;
53  import net.sf.okapi.applications.rainbow.lib.EncodingManager;
54  import net.sf.okapi.applications.rainbow.lib.LanguageItem;
55  import net.sf.okapi.applications.rainbow.lib.LanguageManager;
56  import net.sf.okapi.common.BOMNewlineEncodingDetector;
57  import net.sf.okapi.common.IParametersEditorMapper;
58  import net.sf.okapi.common.ListUtil;
59  import net.sf.okapi.common.LocaleId;
60  import net.sf.okapi.common.MimeTypeMapper;
61  import net.sf.okapi.common.StringUtil;
62  import net.sf.okapi.common.Util;
63  import net.sf.okapi.common.filters.FilterConfiguration;
64  import net.sf.okapi.common.filters.IFilter;
65  import net.sf.okapi.common.filters.IFilterConfigurationMapper;
66  import net.sf.okapi.common.filters.SharedFilterConfigurationMapper;
67  import net.sf.okapi.common.pipeline.IPipelineStep;
68  import net.sf.okapi.common.pipeline.PipelineReturnValue;
69  import net.sf.okapi.common.pipelinebuilder.XBatch;
70  import net.sf.okapi.common.pipelinebuilder.XBatchItem;
71  import net.sf.okapi.common.pipelinebuilder.XDocument;
72  import net.sf.okapi.common.pipelinebuilder.XParameter;
73  import net.sf.okapi.common.pipelinebuilder.XPipeline;
74  import net.sf.okapi.common.pipelinebuilder.XPipelineStep;
75  import net.sf.okapi.connectors.google.GoogleMTv2Connector;
76  import net.sf.okapi.connectors.google.GoogleMTv2Parameters;
77  import net.sf.okapi.steps.common.FilterEventsToRawDocumentStep;
78  import net.sf.okapi.steps.common.RawDocumentToFilterEventsStep;
79  import net.sf.okapi.steps.desegmentation.DesegmentationStep;
80  import net.sf.okapi.steps.heuristicaligner.HeuristicSentenceAlignerStep;
81  import net.sf.okapi.steps.leveraging.LeveragingStep;
82  import net.sf.okapi.steps.llmsentencealigner.LlmSentenceAlignerStep;
83  import net.sf.okapi.steps.segmentation.SegmentationStep;
84  
85  /**
86   * Utility class providing common controller functionality for the ATH application.
87   * This class manages filter configurations, language information, encoding details,
88   * parameter validation, and pagination utilities.
89   * 
90   * <p>
91   * Key responsibilities include:
92   * <ul>
93   * <li>Loading and managing Okapi filter configurations</li>
94   * <li>Providing language and encoding information lists</li>
95   * <li>Parameter validation for web service endpoints</li>
96   * <li>Pagination support for large data sets</li>
97   * </ul>
98   * 
99   * <p>
100  * This class is initialized statically and caches language, encoding, and filter
101  * information for performance. All methods are static and the class is not meant
102  * to be instantiated.
103  * 
104  * @author Acumen Velocity
105  * @version 1.0
106  * @since 1.0
107  */
108 public class ControllerUtil {
  // Base name of the resource bundle that lists the default Okapi filter, parameters and
  // editor classes (read by setMappings()).
  private static final String BUNDLE_NAME = "net.sf.okapi.common.filters.DefaultFilters";

  // Set in the static initializer from getProjectIdFromKeyFile().
  private static String projectId;

  // Caches filled once by the static initializer below.
  private static final List<LanguageInfo> languageInfos;
  private static final List<EncodingInfo> encodingInfos;
  private static final List<FilterInfo> filterInfos;

  // Mapper holding every filter configuration registered by this class.
  private static IFilterConfigurationMapper fcMapper = new SharedFilterConfigurationMapper();

  // File extension -> filter configuration id, (re)built by createFilterInfos().
  private static Map<String, String> extensionsMap = new TreeMap<>();

  static {
    // We register our custom filters before the call of createFilterInfos(). That way the filter
    // configuration mapper will skip adding the default Okapi configurations if their names are
    // already registered with it
    fcMapper.addConfigurations(AthPdfFilter.class.getName());

    languageInfos = createLanguageInfos();
    encodingInfos = createEncodingInfos();
    filterInfos = createFilterInfos();

    projectId = getProjectIdFromKeyFile();
  }
131 
  /**
   * Returns the language list that was built during static initialization.
   *
   * @return the cached, unmodifiable list of languages
   */
  public static List<LanguageInfo> getLanguageInfos() {
    return languageInfos;
  }
135 
  /**
   * Returns the encoding list that was built during static initialization.
   *
   * @return the cached, unmodifiable list of encodings
   */
  public static List<EncodingInfo> getEncodingInfos() {
    return encodingInfos;
  }
139 
  /**
   * Returns the filter list that was built during static initialization.
   *
   * @return the cached, unmodifiable list of filter descriptions, sorted by name ignoring case
   */
  public static List<FilterInfo> getFilterInfos() {
    return filterInfos;
  }
143 
  /**
   * Returns the filter configuration mapper used by this class.
   *
   * @return the mapper instance itself (not a copy), so changes made through it are visible to
   *         every other caller
   */
  public static IFilterConfigurationMapper getFcMapper() {
    return fcMapper;
  }
147 
148   public static boolean checkObjectIdParam(String param) {
149     return checkParam(param) && ObjectId.isValid(param);
150   }
151 
152   /**
153    * Add the default mappings provided in the DefaultFilters.properties file.
154    * 
155    * @param fcMapper          the mapper where to add the mapping.
156    * @param reset             true to clear all filters, editors and dec descriptions in the mapper
157    *                          before setting the new ones.
158    * @param addConfigurations true to add the filters configurations, false to add
159    *                          only the parameters editors and UI descriptions.
160    */
161   public static void setMappings(IFilterConfigurationMapper fcMapper, boolean reset,
162       boolean addConfigurations) {
163 
164     // Create the bundle and load it
165     ResourceBundle res = ResourceBundle.getBundle(BUNDLE_NAME);
166     Enumeration<String> keys = res.getKeys();
167     ArrayList<String> list = Collections.list(keys);
168 
169     if (reset) {
170       fcMapper.clearConfigurations(false);
171       ((IParametersEditorMapper) fcMapper).clearDescriptionProviders();
172       ((IParametersEditorMapper) fcMapper).clearEditors();
173     }
174 
175     // Go through the keys
176     for (String key : list) {
177       // Skip non-filterClass entries
178       if (!key.startsWith("filterClass"))
179         continue;
180 
181       try {
182         int n = key.indexOf('_');
183         String suffix = key.substring(n);
184         String value = res.getString(key);
185 
186         // Add the configurations for the filter
187         if (addConfigurations) {
188           fcMapper.addConfigurations(value);
189         }
190 
191         String key2 = "parametersClass" + suffix;
192 
193         if (list.contains(key2)) {
194           String paramsClass = res.getString(key2);
195 
196           // Add editor if available
197           String key3 = "parametersEditorClass" + suffix;
198 
199           if (list.contains(key3)) {
200             value = res.getString(key3);
201             ((IParametersEditorMapper) fcMapper).addEditor(value, paramsClass);
202 
203           } else { // Add editor descriptor if available
204             key3 = "editorDescriptionProvider" + suffix;
205 
206             if (list.contains(key3)) {
207               value = res.getString(key3);
208               ((IParametersEditorMapper) fcMapper).addDescriptionProvider(value, paramsClass);
209             }
210           }
211         }
212 
213       } catch (Exception ex) {
214         Log.warn(ControllerUtil.class,
215             "Error while trying to build filter for the property key '{}' -- {}", key,
216             ex.getMessage());
217 
218         continue;
219       }
220     }
221   }
222 
223   private static List<FilterInfo> createFilterInfos() {
224     List<FilterInfo> filterInfos = new ArrayList<>();
225     extensionsMap.clear();
226 
227     setMappings(fcMapper, false, true);
228     Iterator<FilterConfiguration> iter = fcMapper.getAllConfigurations();
229     StringBuilder sb = new StringBuilder();
230 
231     while (iter.hasNext()) {
232       FilterConfiguration fc = iter.next();
233 
234       try {
235         IFilter filter = fcMapper.createFilter(fc.configId);
236         sb.append(Log.format("{} -- {}\n", fc.configId, fc.name));
237 
238         String paramsSt = filter == null || filter.getParameters() == null ? ""
239             : filter.getParameters().toString();
240 
241         FilterInfo fi = new FilterInfo()
242             .id(fc.configId)
243             .name(fc.name)
244             .description(fc.description)
245             .custom(fc.custom)
246             .mimeType(fc.mimeType)
247             .fileExtensions(ListUtil.stringAsList(fc.extensions, ";"))
248             .parameters(paramsSt)
249             .parametersFormat(getFormat(paramsSt));
250 
251         for (String ext : fi.getFileExtensions()) {
252           extensionsMap.put(ext, fi.getId());
253         }
254 
255         filterInfos.add(fi);
256 
257       } catch (Exception e) {
258         Log.warn(ControllerUtil.class, "Cannot instantiate the filter '{}' -- {}", fc.filterClass,
259             e.getMessage());
260 
261         continue;
262       }
263     }
264 
265     Collections.sort(filterInfos,
266         Comparator.comparing(FilterInfo::getName, String.CASE_INSENSITIVE_ORDER));
267 
268     Log.debug(ControllerUtil.class, "\n\n------ Filter configs:\n{}\n\n", sb.toString());
269     return Collections.unmodifiableList(filterInfos);
270 
271   }
272 
273   private static ParametersFormat getFormat(String st) {
274     if (st.startsWith("#v")) {
275       return ParametersFormat.FPRM;
276 
277     } else if (st.startsWith("<?xml")) {
278       return ParametersFormat.XML;
279 
280     } else {
281       return ParametersFormat.YAML;
282     }
283   }
284 
285   private static List<EncodingInfo> createEncodingInfos() {
286     List<EncodingInfo> encodings = new ArrayList<>();
287     EncodingManager em = new EncodingManager();
288 
289     try (InputStream is = MainForm.class.getResourceAsStream("/shared/encodings.xml")) {
290       em.loadList(is);
291       EncodingItem ei;
292 
293       for (int i = 0; i < em.getCount(); i++) {
294         ei = em.getItem(i);
295 
296         EncodingInfo item = new EncodingInfo()
297             .name(ei.name)
298             .codePage(ei.codePage)
299             .ianaName(ei.ianaName);
300 
301         encodings.add(item);
302       }
303 
304     } catch (IOException e) {
305       Log.warn(ControllerUtil.class, "Error getting encodings -- {}", e.getMessage());
306     }
307 
308     return Collections.unmodifiableList(encodings);
309   }
310 
311   private static List<LanguageInfo> createLanguageInfos() {
312     List<LanguageInfo> langInfos = new ArrayList<>();
313 
314     LanguageManager lm = new LanguageManager();
315     LanguageItem li;
316 
317     for (int i = 0; i < lm.getCount(); i++) {
318       li = lm.getItem(i);
319 
320       LanguageInfo item = new LanguageInfo();
321 
322       item.setName(li.name);
323       item.setIsoCode(li.code);
324 
325       langInfos.add(item);
326     }
327 
328     return Collections.unmodifiableList(langInfos);
329   }
330 
331   public static boolean checkParam(String param) {
332     // Swagger inflector passes controllers the "null" string for
333     // endpoint null params (String#format() does it for {})
334     return !Util.isEmpty(param) && !"null".equalsIgnoreCase(param);
335   }
336 
337   public static boolean checkParam(Object param) {
338     return param != null;
339   }
340 
341   /**
342    * Returns a view (not a new list) of the sourceList for the range based on page
343    * and pageSize.
344    *
345    * @param sourceList
346    * @param page,      page number should start from 1
347    * @param pageSize
348    * @return custom error can be given instead of returning emptyList
349    */
350   public static <T> List<T> getPage(List<T> sourceList, int page, int pageSize) {
351     if (pageSize <= 0 || page <= 0) {
352       throw new IllegalArgumentException("Invalid page size: " + pageSize);
353     }
354 
355     int fromIndex = (page - 1) * pageSize;
356 
357     if (sourceList == null || sourceList.size() <= fromIndex) {
358       return Collections.emptyList();
359     }
360 
361     // toIndex exclusive
362     return sourceList.subList(fromIndex, Math.min(fromIndex + pageSize, sourceList.size()));
363   }
364 
365   public static String getFilterId(String fileExt) {
366     return extensionsMap.get(fileExt);
367   }
368 
369   public static boolean checkUuidParam(String param) {
370     return checkParam(param) && isValidUuid(param);
371   }
372 
373   private static boolean isValidUuid(String uuid) {
374     // UUID pattern: 8-4-4-4-12 hex digits with optional hyphens
375     String uuidPattern = "^[0-9a-fA-F]{8}-?[0-9a-fA-F]{4}-?[0-9a-fA-F]{4}-?[0-9a-fA-F]{4}-?"
376         + "[0-9a-fA-F]{12}$";
377 
378     return uuid.matches(uuidPattern);
379   }
380 
381   public static ResponseContext importFile(
382       SolrInputDocument doc,
383       UUID docId,
384       String docFileName,
385       URI docGcsUrl,
386       String docEncoding,
387       String srcLang,
388       String trgLang,
389       String filterId,
390       String filterParams,
391       String srcSrx,
392       UUID tmId,
393       Integer tmThreshold,
394       String mtEngineId,
395       String mtEngineParams,
396       List<MtResources> mtCustomResources,
397       boolean mtProvideConfidenceScores,
398       boolean mtUseTranslateLlm,
399       boolean mtSendPlainText,
400       boolean useCodesReinsertionModel,
401       String codesReinsertionModelName,
402       UUID userId,
403       boolean newDoc) throws Exception {
404 
405     IFilterConfigurationMapper fcMapper = ControllerUtil.getFcMapper();
406 
407     if (Util.isEmpty(filterId)) {
408       // Filter is not specified, figure it out from the file extension
409       String fileExt = Util.getExtension(docFileName);
410 
411       if (Util.isEmpty(fileExt)) {
412         throw new AthRuntimeException("No file extension or filterId specified");
413 
414       } else {
415         filterId = ControllerUtil.getFilterId(fileExt);
416 
417         if (Util.isEmpty(filterId)) {
418           throw new AthRuntimeException("Unknown file extension");
419         }
420       }
421     }
422 
423     try (IFilter filter = fcMapper.createFilter(filterId)) {
424       if (!Util.isEmpty(filterParams)) {
425         // Filters are expected to instantiate their parameters
426         filter.getParameters().fromString(filterParams);
427       }
428 
429       doc.setField(Const.ATH_PROP_FILTER_ID, filterId);
430 
431       if (filterParams == null && filter.getParameters() != null) {
432         filterParams = filter.getParameters().toString();
433       }
434 
435       if (filterParams != null) {
436         doc.setField(Const.ATH_PROP_FILTER_PARAMS, filterParams);
437       }
438 
439       Log.info(ControllerUtil.class, "{} (id='{}') was created", filter.getName(), filterId);
440 
441       if (Util.isEmpty(srcSrx)) {
442         srcSrx = getSrx(srcLang);
443       }
444 
445       doc.setField(Const.ATH_PROP_SRC_SRX, srcSrx);
446 
447       LeveragingStep ls = tmId == null ? null : new LeveragingStep();
448 
449       if (ls != null) {
450         net.sf.okapi.steps.leveraging.Parameters lsParams = ls.getParameters();
451         lsParams.setResourceClassName(SolrTmConnector.class.getName());
452 
453         com.acumenvelocity.ath.solr.tm.Parameters connParams = new com.acumenvelocity.ath.solr.tm.Parameters();
454         connParams.setTmId(tmId.toString());
455 
456         lsParams.setResourceParameters(connParams.toString());
457         lsParams.setThreshold(tmThreshold);
458         lsParams.setFillTarget(true);
459         lsParams.setFillTargetThreshold(tmThreshold);
460       }
461 
462       boolean useMt = !Util.isEmpty(mtEngineId); // mtEngineParams can be null (defaults to be used)
463       IPipelineStep mts = null;
464 
465       if (useMt) {
466         switch (mtEngineId) {
467         case Const.MT_PROVIDER_GOOGLE_MT:
468           MtLeveragingStep v2Mts = new MtLeveragingStep(mtSendPlainText);
469           mts = v2Mts;
470 
471           net.sf.okapi.steps.leveraging.Parameters lsParams = v2Mts.getParameters();
472           lsParams.setResourceClassName(GoogleMTv2Connector.class.getName());
473 
474           if (Util.isEmpty(mtEngineParams)) {
475             GoogleMTv2Parameters connParams = new GoogleMTv2Parameters();
476             connParams.setApiKey(Const.ATH_GCT_API_KEY);
477             connParams.setRetryIntervalMs(0);
478             connParams.setRetryCount(1);
479 
480             lsParams.setResourceParameters(connParams.toString());
481 
482           } else {
483             lsParams.setResourceParameters(mtEngineParams);
484           }
485 
486           break;
487 
488         case Const.MT_PROVIDER_GOOGLE_MT_V3:
489           BatchMtStep bmts = new BatchMtStep();
490           mts = bmts;
491 
492           BatchMtParameters bmtsParams = bmts.getParameters();
493 
494           bmtsParams.setApiKey(Const.ATH_GCT_API_KEY);
495           bmtsParams.setProjectLocation(Const.ATH_GCP_PROJECT_LOCATION);
496           bmtsParams.setCredentialsPath(Const.ATH_GCP_SECRET_FILE);
497 
498           boolean isCustomModel = mtCustomResources != null && mtCustomResources.size() > 0;
499 
500           if (isCustomModel) {
501             MtResources res = mtCustomResources.get(0);
502 
503             bmtsParams.setGlossaryProjectId(res.getMtGlossaryProjectId());
504             bmtsParams.setGlossaryProjectLocation(res.getMtGlossaryProjectLocation());
505             bmtsParams.setGlossaryId(res.getMtGlossaryId());
506             bmtsParams.setModelProjectId(res.getMtModelProjectId());
507             bmtsParams.setModelProjectLocation(res.getMtModelProjectLocation());
508             bmtsParams.setModelId(res.getMtModelId());
509           }
510 
511           bmtsParams.setMimeType(mtSendPlainText ? MimeTypeMapper.PLAIN_TEXT_MIME_TYPE
512               : MimeTypeMapper.HTML_MIME_TYPE);
513 
514           bmtsParams.setRetryIntervalMs(0);
515           bmtsParams.setRetryCount(1);
516           bmtsParams.setFailuresBeforeAbort(-1);
517 
518           bmtsParams.setMtUseTranslateLlm(mtUseTranslateLlm);
519           bmtsParams.setMtSendPlainText(mtSendPlainText);
520           break;
521 
522         default:
523           break;
524         }
525       }
526 
527       // try (InputStream is = AthStorage.getInputStream(docGcsUrl)) {
528       // File tempFile = AthUtil.createTempFile();
529       // StreamUtil.copy(is, tempFile);
530       // tempFile.delete();
531       // }
532 
533       // Detect the need of OCR for PDF if the OCR mode is set to AUTO (auto-detect, default)
534       // If not set to AUTO, then follow the explicit user's setting for the OCR mode
535       if (filter instanceof AthPdfFilter) {
536         Parameters params = (Parameters) filter.getParameters();
537 
538         if (params != null && params.getOcrMode() == OcrMode.AUTO) {
539           File docFile = AthUtil.createTempFile();
540 
541           try {
542             AthStorage.storeFile(docGcsUrl, MimeTypeMapper.PDF_MIME_TYPE, docFile);
543             boolean needsOcr = PdfUtil.needsOcr(docFile);
544 
545             if (needsOcr) {
546               params.setOcrMode(OcrMode.ENABLED);
547 
548             } else {
549               params.setOcrMode(OcrMode.DISABLED);
550             }
551 
552           } catch (Exception e) {
553             Log.error(ControllerUtil.class, e, "PDF conversion error");
554 
555           } finally {
556             try {
557               Files.deleteIfExists(docFile.toPath());
558 
559             } catch (Exception ignored) {
560             }
561           }
562         }
563       }
564 
565       // Import the document
566       try (InputStream is = AthStorage.getInputStream(docGcsUrl);
567 
568           XPipeline pl = new XPipeline(
569               "Import pipeline",
570 
571               new XBatch(
572                   new XBatchItem(
573                       new XDocument(
574                           is,
575                           docEncoding,
576                           LocaleId.fromString(srcLang),
577                           LocaleId.fromString(trgLang)))),
578 
579               new RawDocumentToFilterEventsStep(filter),
580 
581               new XPipelineStep(
582                   new SegmentationStep(),
583                   new XParameter("sourceSrx", srcSrx),
584                   new XParameter("targetSrx", null),
585                   new XParameter("segmentSource", !Util.isEmpty(srcSrx)),
586                   new XParameter("segmentTarget", false),
587                   new XParameter("copySource", false)),
588 
589               new SegmentTrimmerStep(),
590               ls,
591 
592               mtProvideConfidenceScores
593                   ? new MtConfidenceScoringStep(mtCustomResources, mtSendPlainText)
594                   : mts,
595 
596               mtSendPlainText ?
597               // MT step (mts) removes codes if mtSendPlainText is true, re-insert
598               // The confidence scores provider works with plain target text, re-insert
599                   new CodesReinsertionStep(useCodesReinsertionModel, codesReinsertionModelName)
600                   : null,
601 
602               new SolrDocWriterStep(docId, docFileName, userId, newDoc))) {
603 
604         PipelineReturnValue res = pl.execute();
605 
606         if (res == PipelineReturnValue.SUCCEDED) {
607           return Response.success(200);
608         }
609       }
610 
611       return Response.error(500, "Import failed");
612 
613     } catch (Exception e) {
614       String st = newDoc ? "Import failed" : "Update failed";
615       Log.error(ControllerUtil.class, e, st);
616       return Response.error(500, e, st);
617     }
618   }
619 
  /**
   * Aligns a source document with its translation through an Okapi pipeline and writes the
   * result with the Solr writer step.
   *
   * <p>
   * The filter id, the effective filter parameters and the SRX rules are also recorded as fields
   * of the given Solr input document. The alignment pipeline is built only when docTrlGcsUrl is
   * not null.
   *
   * <p>
   * NOTE(review): when docTrlGcsUrl is null no pipeline is run and the method ends with the 500
   * "Alignment failed" response -- confirm this is intended. The mtSendPlainText parameter is
   * not read anywhere in this method.
   *
   * @param doc                       the Solr input document that receives the filter id,
   *                                  filter parameters and SRX fields
   * @param docId                     the document id passed to the Solr writer step
   * @param docFile                   a local copy of the document, used only to detect the need
   *                                  of OCR for PDF; may be null to skip the detection
   * @param docFileName               the document file name; its extension selects the filter
   *                                  when filterId is empty
   * @param docGcsUrl                 the storage location of the source document
   * @param docEncoding               the encoding of the source document
   * @param docTrlGcsUrl              the storage location of the translated document, may be
   *                                  null
   * @param docTrlEncoding            the encoding of the translated document
   * @param srcLang                   the source language code
   * @param trgLang                   the target language code
   * @param filterId                  the filter configuration id, may be empty
   * @param filterParams              the serialized filter parameters, may be empty to keep the
   *                                  filter defaults
   * @param srcSrx                    the source SRX rules, may be empty to use the rules
   *                                  returned by getSrx(srcLang)
   * @param trgSrx                    the target SRX rules, may be empty to use the rules
   *                                  returned by getSrx(trgLang) when a translation is given
   * @param mtSendPlainText           not used by this method
   * @param useAlignmentModel         true to align with the LLM sentence aligner step, false to
   *                                  use the heuristic sentence aligner step
   * @param alignmentModelName        the model name passed to the LLM sentence aligner step
   * @param useCodesReinsertionModel  passed to the LLM sentence aligner and codes re-insertion
   *                                  steps
   * @param codesReinsertionModelName passed to the codes re-insertion step
   * @param userId                    the user id passed to the Solr writer step
   * @return a 200 response if the pipeline succeeded, a 500 response otherwise
   * @throws AthRuntimeException if filterId is empty and no filter can be derived from the file
   *                             extension
   * @throws Exception           declared for callers; failures inside the alignment itself are
   *                             reported as a 500 response instead
   */
  public static ResponseContext alignFile(
      SolrInputDocument doc,
      UUID docId,
      File docFile,
      String docFileName,
      URI docGcsUrl,
      String docEncoding,
      URI docTrlGcsUrl,
      String docTrlEncoding,
      String srcLang,
      String trgLang,
      String filterId,
      String filterParams,
      String srcSrx,
      String trgSrx,
      boolean mtSendPlainText,
      boolean useAlignmentModel,
      String alignmentModelName,
      boolean useCodesReinsertionModel,
      String codesReinsertionModelName,
      UUID userId) throws Exception {

    boolean alignWithTranslation = docTrlGcsUrl != null;
    IFilterConfigurationMapper fcMapper = ControllerUtil.getFcMapper();

    if (Util.isEmpty(filterId)) {
      // Filter is not specified, figure it out from the file extension
      String fileExt = Util.getExtension(docFileName);

      if (Util.isEmpty(fileExt)) {
        throw new AthRuntimeException("No file extension or filterId specified");

      } else {
        filterId = ControllerUtil.getFilterId(fileExt);

        if (Util.isEmpty(filterId)) {
          throw new AthRuntimeException("Unknown file extension");
        }
      }
    }

    try (IFilter filter = fcMapper.createFilter(filterId)) {
      if (!Util.isEmpty(filterParams)) {
        // Filters are expected to instantiate their parameters
        filter.getParameters().fromString(filterParams);
      }

      doc.setField(Const.ATH_PROP_FILTER_ID, filterId);

      // Record the filter defaults when the caller did not provide parameters
      if (filterParams == null && filter.getParameters() != null) {
        filterParams = filter.getParameters().toString();
      }

      if (filterParams != null) {
        doc.setField(Const.ATH_PROP_FILTER_PARAMS, filterParams);
      }

      Log.info(ControllerUtil.class, "{} (id='{}') was created", filter.getName(), filterId);

      if (Util.isEmpty(srcSrx)) {
        srcSrx = getSrx(srcLang);
      }

      doc.setField(Const.ATH_PROP_SRC_SRX, srcSrx);

      if (Util.isEmpty(trgSrx)) {
        if (alignWithTranslation) {
          trgSrx = getSrx(trgLang);

        } else {
          // No target segmentation requested otherwise
        }
      }

      if (!Util.isEmpty(trgSrx)) {
        doc.setField(Const.ATH_PROP_TRG_SRX, trgSrx);
      }

      // Detect the need of OCR for PDF if the OCR mode is set to AUTO (auto-detect, default)
      // If not set to AUTO, then follow the explicit user's setting for the OCR mode
      if (filter instanceof AthPdfFilter && docFile != null) {
        Parameters params = (Parameters) filter.getParameters();

        if (params != null && params.getOcrMode() == OcrMode.AUTO) {
          boolean needsOcr = PdfUtil.needsOcr(docFile);

          if (needsOcr) {
            params.setOcrMode(OcrMode.ENABLED);

          } else {
            params.setOcrMode(OcrMode.DISABLED);
          }
        }
      }

      if (alignWithTranslation) {
        // We have to use temporary files because SentenceAlignerStep takes only file paths for SRX
        File srcSrxFile = AthUtil.createTempFile();
        File trgSrxFile = AthUtil.createTempFile();

        try {
          // srcSrx and trgSrx are guaranteed to be set here
          StringUtil.writeString(srcSrx, srcSrxFile);
          StringUtil.writeString(trgSrx, trgSrxFile);

          // A second filter instance is created for the aligner steps; the first one reads the
          // source document in the pipeline
          try (InputStream sis = AthStorage.getInputStream(docGcsUrl);
              InputStream tis = AthStorage.getInputStream(docTrlGcsUrl);
              IFilter targetFilter = fcMapper.createFilter(filterId);

              XPipeline pl = new XPipeline(
                  "Alignment pipeline",

                  new XBatch(
                      new XBatchItem(
                          new XDocument(
                              sis,
                              docEncoding,
                              LocaleId.fromString(srcLang),
                              LocaleId.fromString(trgLang)),

                          new XDocument(
                              tis,
                              docTrlEncoding,
                              LocaleId.fromString(trgLang),
                              LocaleId.fromString(trgLang)))),

                  new RawDocumentToFilterEventsStep(filter),

                  !useAlignmentModel
                      ? new XPipelineStep(
                          new HeuristicSentenceAlignerStep(targetFilter),
                          new XParameter("generateTMX", false),
                          new XParameter("tmxOutputPath", null),
                          new XParameter("collapseWhitespace", false),
                          new XParameter("outputOneTOneMatchesOnly", false),
                          new XParameter("forceSimpleOneToOneAlignment", false),
                          new XParameter("segmentSource", true),
                          new XParameter("useCustomSourceRules", true),
                          new XParameter("customSourceRulesPath", srcSrxFile.getAbsolutePath()),
                          new XParameter("segmentTarget", true),
                          new XParameter("useCustomTargetRules", true),
                          new XParameter("customTargetRulesPath", trgSrxFile.getAbsolutePath()))

                      : new XPipelineStep(
                          new LlmSentenceAlignerStep(targetFilter), // !!! targetFilter, not filter
                          new XParameter("useLlmAlignment", true),
                          new XParameter("llmModel", alignmentModelName),
                          new XParameter("useCodesReinsertionModel", useCodesReinsertionModel),
                          new XParameter("maxParagraphsPerRequest", 1000),
                          new XParameter("logAlignmentDetails", true),
                          new XParameter("collapseWhitespace", true),

                          new XParameter("segmentSource", !Util.isEmpty(srcSrx)),
                          new XParameter("useCustomSourceRules", !Util.isEmpty(srcSrx)),
                          new XParameter("customSourceRulesPath", srcSrxFile.getAbsolutePath()),

                          new XParameter("segmentTarget", !Util.isEmpty(trgSrx)),
                          new XParameter("useCustomTargetRules", !Util.isEmpty(trgSrx)),
                          new XParameter("customTargetRulesPath", trgSrxFile.getAbsolutePath())),

                  // LlmSentenceAlignerStep removes codes when useCodesReinsertionModel == true,
                  // re-insert
                  // If useCodesReinsertionModel == false and the codes were not removed by previous
                  // steps, we only check the target codes against the source ones, no harm, just an
                  // extra check
                  new CodesReinsertionStep(useCodesReinsertionModel, codesReinsertionModelName),
                  new SolrDocWriterStep(docId, docFileName, userId, false))) {

            // // Set filter config to the 2nd input
            // RawDocument trl = pl.getBatch().getItems().get(0).getRawDocument(1);
            // trl.setFilterConfigId(filterId);

            PipelineReturnValue res = pl.execute();

            if (res == PipelineReturnValue.SUCCEDED) {
              return Response.success(200);
            }
          }

        } finally {
          // The temporary SRX files are removed whatever the pipeline outcome is
          if (srcSrxFile != null) {
            srcSrxFile.delete();
          }

          if (trgSrxFile != null) {
            trgSrxFile.delete();
          }
        }
      }

      // Reached when no translation was given or the pipeline did not succeed
      return Response.error(500, "Alignment failed");

    } catch (Exception e) {
      String st = "Alignment failed";
      Log.error(ControllerUtil.class, e, st);
      return Response.error(500, e, st);
    }
  }
818 
819   /**
820    * Get SRX from a file in the SRX repository, fall back to the default SRX rules.
821    * 
822    * @param lang
823    * @return
824    */
825   private static String getSrx(String lang) {
826     String fn = SrxFileMapper.getSrxFileName(lang);
827 
828     String srxFileName = (fn == null)
829         ? "/srx/alternate-default.srx"
830         // FIXME use custom SRX files instead of the possibly-non-licensed MemoQ ones
831         : "/srx/memoq/default/v9.0/" + fn;
832 
833     try (InputStream in = ControllerUtil.class.getResourceAsStream(srxFileName)) {
834       if (in == null) {
835         throw new IllegalArgumentException("SRX file not found: " + srxFileName);
836       }
837 
838       // --- Use Okapi’s detector to handle BOM ---
839       BOMNewlineEncodingDetector detector = new BOMNewlineEncodingDetector(in,
840           StandardCharsets.UTF_8);
841 
842       detector.detectAndRemoveBom(); // Detect + skip BOM if present
843 
844       // --- Read remaining content as string ---
845       byte[] data = detector.getInputStream().readAllBytes();
846       return new String(data, StandardCharsets.UTF_8);
847 
848     } catch (Exception e) {
849       throw new RuntimeException("Failed to read SRX file: " + srxFileName, e);
850     }
851   }
852 
853   public static ResponseContext exportFile(
854       SolrDocument doc,
855       SolrDocument tmDoc,
856       SolrInputDocument updateDoc,
857       URI docOutGcsUrl,
858       String docOutEncoding,
859       UUID tmId,
860       UUID userId) throws Exception {
861 
862     String catFrameworkName = SolrUtil.safeGetField(doc, Const.ATH_PROP_CAT_FRAMEWORK_NAME,
863         null);
864 
865     String catFrameworkVersion = SolrUtil.safeGetField(doc, Const.ATH_PROP_CAT_FRAMEWORK_VERSION,
866         null);
867 
868     // String tmFileName = SolrUtil.safeGetField(tmDoc, Const.ATH_PROP_TM_FILE_NAME, null);
869 
870     // Check if the import has been performed with the same Okapi version as the current one. If
871     // not, the code of the filter could have changed between the versions, now generates
872     // different events, and the translated TUs get un-mergeable with the current version filter
873     // events.
874     //
875     // XXX To avoid this, keep multiple ATH-Okapi API Docker containers, a separate image for each
876     // Okapi version (tag them as "1.47.0" etc.), parse the document data in ath_state to figure out
877     // the Okapi version that was used for import, and use the container for that given version for
878     // document export.
879 
880     if (catFrameworkName != null && !catFrameworkName.equals(Const.CAT_FRAMEWORK_NAME)) {
881       throw new AthRuntimeException("CAT framework mismatch -- expected: {}, actual: {}",
882           catFrameworkName, Const.CAT_FRAMEWORK_NAME);
883     }
884 
885     if (catFrameworkVersion != null && !catFrameworkVersion.equals(Const.CAT_FRAMEWORK_VERSION)) {
886       Log.warn(ControllerUtil.class,
887           "Export success is not guaranteed because of the {} version mismatch -- expected: {}, "
888               + "actual: {}",
889           Const.CAT_FRAMEWORK_NAME, catFrameworkVersion, Const.CAT_FRAMEWORK_VERSION);
890     }
891 
892     IFilterConfigurationMapper fcMapper = ControllerUtil.getFcMapper();
893 
894     String docId = SolrUtil.safeGetField(doc, Const.ATH_PROP_DOC_ID, null);
895     String filterId = SolrUtil.safeGetField(doc, Const.ATH_PROP_FILTER_ID, null);
896     String filterParams = SolrUtil.safeGetField(doc, Const.ATH_PROP_FILTER_PARAMS, null);
897     String docStorageName = SolrUtil.safeGetField(doc, Const.ATH_PROP_DOC_STORAGE_NAME, null);
898 
899     String docFileEncoding = SolrUtil.safeGetField(doc, Const.ATH_PROP_DOC_FILE_ENCODING,
900         null);
901 
902     String srcLang = SolrUtil.safeGetField(doc, Const.ATH_PROP_SRC_LANG, null);
903     String trgLang = SolrUtil.safeGetField(doc, Const.ATH_PROP_TRG_LANG, null);
904     // String srcSrx = SolrUtil.safeGetField(doc, Const.ATH_PROP_SRC_SRX, null);
905     // String trgSrx = SolrUtil.safeGetField(doc, Const.ATH_PROP_TRG_SRX, null);
906 
907     IFilter filter = fcMapper.createFilter(filterId);
908 
909     // Filters are expected to instantiate their parameters
910     filter.getParameters().fromString(filterParams);
911 
912     Log.info(ControllerUtil.class, "{} (id='{}') was created", filter.getName(), filterId);
913 
914     File outFile = null;
915 
916     try {
917       outFile = AthUtil.createTempFile();
918 
919       try (InputStream is = AthStorage.getInputStream(AthUtil.toURI(docStorageName));
920           XPipeline pl = new XPipeline(
921               "Export pipeline",
922 
923               new XBatch(
924                   new XBatchItem(
925                       new XDocument(
926                           is,
927                           docFileEncoding,
928                           outFile.getAbsolutePath(),
929                           docOutEncoding,
930                           LocaleId.fromString(srcLang),
931                           LocaleId.fromString(trgLang)))),
932 
933               new RawDocumentToFilterEventsStep(filter),
934 
935               // new XPipelineStep(
936               // new SegmentationStep(),
937               // new XParameter("sourceSrx", srcSrx),
938               // new XParameter("targetSrx", trgSrx),
939               // new XParameter("segmentSource", !Util.isEmpty(srcSrx)),
940               // new XParameter("segmentTarget", !Util.isEmpty(trgSrx)),
941               // new XParameter("copySource", false)),
942               //
943               // new SegmentTrimmerStep(),
944 
945               // TODO move to a separate TM import PL/call
946               // tmId == null ? null
947               // : new SolrTmWriterStep(tmId, tmFileName, userId, false),
948 
949               new SolrDocMergerStep(docId),
950               new DesegmentationStep(),
951               new FilterEventsToRawDocumentStep())) {
952 
953         PipelineReturnValue res = pl.execute();
954 
955         if (res == PipelineReturnValue.SUCCEDED) {
956           // Store translation
957           AthStorage.storeFile(docOutGcsUrl, filter.getMimeType(), outFile);
958 
959           return Response.success(200);
960         }
961       }
962 
963       return Response.error(500, "Export failed");
964 
965     } catch (Exception e) {
966       Log.error(ControllerUtil.class, e, "Export failed");
967       return Response.error(500, e, "Export failed");
968     }
969   }
970 
971   public static String getProjectId() {
972     return projectId;
973   }
974 
975   /**
976    * Gets the projectId from the service account key file at Const.ATH_GCP_SECRET_FILE.
977    * Never returns null – throws descriptive exception on failure.
978    */
979   private static String getProjectIdFromKeyFile() {
980     try (FileInputStream fis = new FileInputStream(Const.ATH_GCP_SECRET_FILE)) {
981       GoogleCredentials credentials = GoogleCredentials.fromStream(fis);
982 
983       if (credentials instanceof ServiceAccountCredentials) {
984         ServiceAccountCredentials sac = (ServiceAccountCredentials) credentials;
985         String projectId = sac.getProjectId();
986 
987         if (projectId != null && !projectId.isBlank()) {
988           return projectId;
989         }
990       }
991 
992     } catch (Exception e) {
993       // We intentionally catch everything here and fall back to manual parsing
994       // This makes the method resilient to future Google library changes
995       Log.warn(ConversionUtil.class,
996           "Official credential parsing failed ({}), falling back to JSON parsing",
997           e.getMessage());
998     }
999 
1000     // Fallback: manual JSON parsing (works even if Google library changes)
1001     try {
1002       String json = Files.readString(Paths.get(Const.ATH_GCP_SECRET_FILE));
1003       JsonNode jsonNode = JacksonUtil.makeNode(json);
1004       return jsonNode.get(json) == null ? null : jsonNode.get(json).asText();
1005 
1006     } catch (Exception ex) {
1007       Log.error(ConversionUtil.class, ex, "Failed to extract project_id from key file");
1008     }
1009 
1010     return null;
1011   }
1012 }