View Javadoc
1   package net.sf.okapi.steps.llmsentencealigner;
2   
3   import com.acumenvelocity.ath.common.Const;
4   
5   import net.sf.okapi.common.ParametersDescription;
6   import net.sf.okapi.common.StringParameters;
7   
8   public class LlmSentenceAlignerParameters extends StringParameters {
9   
10    private static final String ALIGNMENT_MODEL_NAME = "alignmentModelName";
11    private static final String MAX_PARAGRAPHS_PER_REQUEST = "maxParagraphsPerRequest";
12    private static final String LOG_ALIGNMENT_DETAILS = "logAlignmentDetails";
13  
14    private static final String SEGMENT_SOURCE = "segmentSource";
15    private static final String USE_CUSTOM_SOURCE_RULES = "useCustomSourceRules";
16    private static final String CUSTOM_SOURCE_RULES_PATH = "customSourceRulesPath";
17    private static final String SEGMENT_TARGET = "segmentTarget";
18    private static final String USE_CUSTOM_TARGET_RULES = "useCustomTargetRules";
19    private static final String CUSTOM_TARGET_RULES_PATH = "customTargetRulesPath";
20    private static final String COLLAPSE_WHITESPACE = "collapseWhitespace";
21  
22    private static final String USE_CODES_REINSERTION_MODEL = "useCodesReinsertionModel";
23    private static final String CODES_REINSERTION_MODEL_NAME = "codesReinsertionModelName";
24  
25    public String getAlignmentModelName() {
26      return getString(ALIGNMENT_MODEL_NAME);
27    }
28  
29    public void setAlignmentModelName(String alignmentModelName) {
30      setString(ALIGNMENT_MODEL_NAME, alignmentModelName);
31    }
32  
33    public int getMaxParagraphsPerRequest() {
34      return getInteger(MAX_PARAGRAPHS_PER_REQUEST);
35    }
36  
37    public void setMaxParagraphsPerRequest(int maxParagraphsPerRequest) {
38      setInteger(MAX_PARAGRAPHS_PER_REQUEST, maxParagraphsPerRequest);
39    }
40  
41    public boolean isLogAlignmentDetails() {
42      return getBoolean(LOG_ALIGNMENT_DETAILS);
43    }
44  
45    public void setLogAlignmentDetails(boolean logAlignmentDetails) {
46      setBoolean(LOG_ALIGNMENT_DETAILS, logAlignmentDetails);
47    }
48  
49    public boolean isSegmentSource() {
50      return getBoolean(SEGMENT_SOURCE);
51    }
52  
53    public void setSegmentSource(boolean segmentSource) {
54      setBoolean(SEGMENT_SOURCE, segmentSource);
55    }
56  
57    public boolean isUseCustomSourceRules() {
58      return getBoolean(USE_CUSTOM_SOURCE_RULES);
59    }
60  
61    public void setUseCustomSourceRules(boolean useCustomSourceRules) {
62      setBoolean(USE_CUSTOM_SOURCE_RULES, useCustomSourceRules);
63    }
64  
65    public String getCustomSourceRulesPath() {
66      return getString(CUSTOM_SOURCE_RULES_PATH);
67    }
68  
69    public void setCustomSourceRulesPath(String customSourceRulesPath) {
70      setString(CUSTOM_SOURCE_RULES_PATH, customSourceRulesPath);
71    }
72  
73    public boolean isSegmentTarget() {
74      return getBoolean(SEGMENT_TARGET);
75    }
76  
77    public void setSegmentTarget(boolean segmentTarget) {
78      setBoolean(SEGMENT_TARGET, segmentTarget);
79    }
80  
81    public boolean isUseCustomTargetRules() {
82      return getBoolean(USE_CUSTOM_TARGET_RULES);
83    }
84  
85    public void setUseCustomTargetRules(boolean useCustomTargetRules) {
86      setBoolean(USE_CUSTOM_TARGET_RULES, useCustomTargetRules);
87    }
88  
89    public String getCustomTargetRulesPath() {
90      return getString(CUSTOM_TARGET_RULES_PATH);
91    }
92  
93    public void setCustomTargetRulesPath(String customTargetRulesPath) {
94      setString(CUSTOM_TARGET_RULES_PATH, customTargetRulesPath);
95    }
96  
97    public boolean isCollapseWhitespace() {
98      return getBoolean(COLLAPSE_WHITESPACE);
99    }
100 
101   public void setCollapseWhitespace(boolean collapseWhitespace) {
102     setBoolean(COLLAPSE_WHITESPACE, collapseWhitespace);
103   }
104 
105   public boolean isUseCodesReinsertionModel() {
106     return getBoolean(USE_CODES_REINSERTION_MODEL);
107   }
108 
109   public void setUseCodesReinsertionModel(boolean useCodesReinsertionModel) {
110     setBoolean(USE_CODES_REINSERTION_MODEL, useCodesReinsertionModel);
111   }
112 
113   public String getCodesReinsertionModelName() {
114     return getString(CODES_REINSERTION_MODEL_NAME);
115   }
116 
117   public void setCodesReinsertionModelName(String modelName) {
118     setString(CODES_REINSERTION_MODEL_NAME, modelName);
119   }
120 
121   @Override
122   public void reset() {
123     super.reset();
124     
125     setAlignmentModelName(Const.GEMINI_ALIGNMENT_MODEL);
126     setMaxParagraphsPerRequest(1000);
127     setLogAlignmentDetails(true);
128 
129     setSegmentSource(true);
130     setUseCustomSourceRules(false);
131     setCustomSourceRulesPath("");
132 
133     setSegmentTarget(true);
134     setUseCustomTargetRules(false);
135     setCustomTargetRulesPath("");
136     setCollapseWhitespace(false);
137 
138     setUseCodesReinsertionModel(false);
139     setCodesReinsertionModelName(Const.GEMINI_CODE_REINSERTION_MODEL);
140   }
141 
142   @Override
143   public ParametersDescription getParametersDescription() {
144     ParametersDescription desc = new ParametersDescription(this);
145 
146     desc.add(ALIGNMENT_MODEL_NAME,
147         "LLM model to use for alignment",
148         "The Gemini AI model name (e.g., 'gpt-4o-mini', 'gpt-4o')");
149 
150     desc.add(MAX_PARAGRAPHS_PER_REQUEST,
151         "Maximum paragraphs per LLM request",
152         "For very large documents, split into batches of this many paragraphs to avoid token limits.");
153 
154     desc.add(LOG_ALIGNMENT_DETAILS,
155         "Log detailed alignment information",
156         "If true, logs detailed information about alignments (useful for debugging but verbose).");
157 
158     desc.add(SEGMENT_SOURCE,
159         "Segment the source content (overriding possible existing segmentation)",
160         null);
161 
162     desc.add(USE_CUSTOM_SOURCE_RULES,
163         "Use custom source segmentation rules (instead of the default ones)",
164         null);
165 
166     desc.add(CUSTOM_SOURCE_RULES_PATH,
167         "SRX path for the source",
168         "Full path of the SRX document to use for the source");
169 
170     desc.add(SEGMENT_TARGET,
171         "Segment the target content (overriding possible existing segmentation)",
172         null);
173 
174     desc.add(USE_CUSTOM_TARGET_RULES,
175         "Use custom target segmentation rules (instead of the default ones)",
176         null);
177 
178     desc.add(CUSTOM_TARGET_RULES_PATH,
179         "SRX path for the target",
180         "Full path of the SRX document to use for the target");
181 
182     desc.add(COLLAPSE_WHITESPACE,
183         "Collapse whitespace",
184         "Collapse whitespace (space, newline etc.) to a single space before segmentation and alignment");
185 
186     desc.add(USE_CODES_REINSERTION_MODEL,
187         "Use LLM-based target codes reinsertion",
188         "True to use LLM-based target codes reinsertion, false to have Okapi reinsert. " +
189         "This option has effect only when plainTextMT is true.");
190 
191     desc.add(CODES_REINSERTION_MODEL_NAME,
192         "Gemini AI model name for target codes reinsertion",
193         "Gemini model to use for reinserting inline codes in the target text.");
194 
195     return desc;
196   }
197 }