1 package com.acumenvelocity.ath.steps;
2
3 import java.util.ArrayList;
4 import java.util.List;
5
6 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory;
8
9 import net.sf.okapi.common.Event;
10 import net.sf.okapi.common.IParameters;
11 import net.sf.okapi.common.ISegmenter;
12 import net.sf.okapi.common.StringUtil;
13 import net.sf.okapi.common.filters.IFilter;
14 import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
15 import net.sf.okapi.common.pipeline.annotations.StepParameterType;
16 import net.sf.okapi.common.resource.ITextUnit;
17 import net.sf.okapi.common.resource.RawDocument;
18 import net.sf.okapi.common.resource.TextPart;
19 import net.sf.okapi.lib.segmentation.SRXDocument;
20
21
22
23
24
25 public abstract class BaseAlignerStep extends BaseTuBatchProcessingStep {
26
27 protected final Logger LOGGER = LoggerFactory.getLogger(getClass());
28
29 private IFilter targetFilter;
30 private RawDocument targetInput = null;
31 private ISegmenter sourceSegmenter;
32 private ISegmenter targetSegmenter;
33
34 private final List<ITextUnit> sourceTUs = new ArrayList<>();
35 private final List<ITextUnit> targetTUs = new ArrayList<>();
36
37 public BaseAlignerStep(IFilter targetFilter) {
38 super();
39 this.targetFilter = targetFilter;
40 }
41
42 @StepParameterMapping(parameterType = StepParameterType.SECOND_INPUT_RAWDOC)
43 public void setSecondInput(RawDocument secondInput) {
44 this.targetInput = secondInput;
45 }
46
47
48
49
50
51 @Override
52 public abstract IParameters getParameters();
53
54
55
56
57 protected abstract boolean isSegmentSource();
58
59
60
61
62 protected abstract boolean isSegmentTarget();
63
64
65
66
67 protected abstract boolean isUseCustomSourceRules();
68
69
70
71
72 protected abstract boolean isUseCustomTargetRules();
73
74
75
76
77 protected abstract String getCustomSourceRulesPath();
78
79
80
81
82 protected abstract String getCustomTargetRulesPath();
83
84
85
86
87 protected abstract boolean isCollapseWhitespace();
88
89
90
91
92
93 protected abstract void performAlignment(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs);
94
95 @Override
96 protected Event handleStartBatch(Event event) {
97 boolean loadDefault = true;
98 SRXDocument srxDocument = new SRXDocument();
99
100
101 if (isSegmentSource()) {
102 if (isUseCustomSourceRules()) {
103 try {
104 srxDocument.loadRules(getCustomSourceRulesPath());
105 loadDefault = false;
106
107 } catch (Exception e) {
108 LOGGER.warn(
109 "Custom source segmentation rules file '{}' cannot be read. Using the default rules instead.",
110 getCustomSourceRulesPath());
111 }
112 }
113
114 if (loadDefault) {
115 srxDocument.loadRules(SRXDocument.DEFAULT_SRX_RULES);
116 }
117
118 sourceSegmenter = srxDocument.compileLanguageRules(getSourceLocale(), null);
119 }
120
121
122 if (isSegmentTarget()) {
123 loadDefault = true;
124
125 if (isUseCustomTargetRules()) {
126 try {
127 srxDocument.loadRules(getCustomTargetRulesPath());
128 loadDefault = false;
129
130 } catch (Exception e) {
131 LOGGER.warn(
132 "Custom target segmentation rules file '{}' cannot be read. Using the default rules instead.",
133 getCustomTargetRulesPath());
134 }
135 }
136
137 if (loadDefault) {
138 srxDocument.loadRules(SRXDocument.DEFAULT_SRX_RULES);
139 }
140
141 targetSegmenter = srxDocument.compileLanguageRules(getTargetLocale(), null);
142 }
143
144 return event;
145 }
146
147 @Override
148 protected Event handleStartDocument(Event event) {
149 sourceTUs.clear();
150 targetTUs.clear();
151
152 if (targetInput != null) {
153 initializeTargetFilter();
154 }
155
156 return super.handleStartDocument(event);
157 }
158
159 @Override
160 protected Event handleTextUnit(Event event) {
161 ITextUnit tu = event.getTextUnit();
162
163 if (!tu.isTranslatable() || tu.isEmpty()) {
164 return Event.createNoopEvent();
165 }
166
167
168 if (isCollapseWhitespace()) {
169 for (TextPart p : tu.getSource().getSegments()) {
170 p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
171 }
172 }
173
174
175 if (isSegmentSource()) {
176 tu.createSourceSegmentation(sourceSegmenter);
177 }
178
179 sourceTUs.add(tu);
180 return Event.createNoopEvent();
181 }
182
183
184
185
186 protected void initializeTargetFilter() {
187 if (targetFilter != null && targetInput != null) {
188 targetFilter.open(targetInput);
189 }
190 }
191
192
193
194
195
196 protected void readAllTargetTUs() {
197 if (targetFilter == null) {
198 return;
199 }
200
201 while (targetFilter.hasNext()) {
202 Event event = targetFilter.next();
203
204 if (event.isTextUnit()) {
205 ITextUnit tu = event.getTextUnit();
206
207 if (tu.isTranslatable() && !tu.isEmpty()) {
208
209 if (isCollapseWhitespace()) {
210 for (TextPart p : tu.getSource().getSegments()) {
211 p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
212 }
213 }
214
215
216 if (isSegmentTarget()) {
217 tu.createSourceSegmentation(targetSegmenter);
218 }
219
220 targetTUs.add(tu);
221 }
222 }
223 }
224 }
225
226 @Override
227 protected void clear() {
228 sourceTUs.clear();
229 targetTUs.clear();
230
231 if (targetFilter != null) {
232 targetFilter.close();
233 }
234 }
235
236 @Override
237 protected void processTuEvents(List<Event> tuEvents) {
238
239 readAllTargetTUs();
240
241
242 performAlignment(sourceTUs, targetTUs);
243 }
244 }