1 package pl.psnc.dl.ege.tei;
2
3 import java.io.BufferedOutputStream;
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileInputStream;
7 import java.io.FileNotFoundException;
8 import java.io.FileOutputStream;
9 import java.io.IOException;
10 import java.io.InputStream;
11 import java.io.OutputStream;
12 import java.io.OutputStreamWriter;
13 import java.io.Writer;
14 import java.util.UUID;
15 import java.util.zip.ZipOutputStream;
16
17 import javax.xml.transform.stream.StreamSource;
18
19 import net.sf.saxon.s9api.Processor;
20 import net.sf.saxon.s9api.QName;
21 import net.sf.saxon.s9api.SaxonApiException;
22 import net.sf.saxon.s9api.Serializer;
23 import net.sf.saxon.s9api.XdmAtomicValue;
24 import net.sf.saxon.s9api.XdmDestination;
25 import net.sf.saxon.s9api.XdmNode;
26 import net.sf.saxon.s9api.XsltCompiler;
27 import net.sf.saxon.s9api.XsltExecutable;
28 import net.sf.saxon.s9api.XsltTransformer;
29
30 import org.tei.docx.DocXPropertiesProvider;
31 import org.tei.exceptions.ConfigurationException;
32 import org.tei.utils.FileUtils;
33 import org.tei.utils.SaxonProcFactory;
34 import org.tei.utils.XMLUtils;
35
36 import pl.psnc.dl.ege.utils.EGEIOUtils;
37
38
39
40
41
42
43
44
45
46
47 class DocXConverter {
48
49 private String directoryName;
50
51 private File zipFile;
52
53 private File teiArchive;
54
55
56
57
58 private DocXPropertiesProvider propertiesProvider;
59
60 private String directoryNameURI;
61
62
63
64
65 private String[] archiveDirectoriesToCopy = new String[] { "word/media",
66 "word/embeddings", "word/fonts" };
67
68
69
70
71
72
73
74
75
76 public DocXConverter(DocXPropertiesProvider propertiesProvider)
77 throws ConfigurationException, IOException {
78 this.propertiesProvider = propertiesProvider;
79 initTemplate();
80 }
81
82
83
84
85
86
87
88
89
90
91 public DocXConverter(DocXPropertiesProvider pp, boolean toXml)
92 throws IOException, ConfigurationException {
93 this.propertiesProvider = pp;
94 if (!toXml) {
95 initTemplate();
96 } else {
97 String tmpDir = propertiesProvider.docx_pp_getTempDir();
98 String uid = UUID.randomUUID().toString();
99 directoryName = tmpDir + File.separator + uid;
100 this.directoryNameURI = new File(directoryName).toURI().toString();
101 }
102 }
103
104
105
106
107 private void initTemplate() throws IOException, ConfigurationException {
108
109 File templateFile = new File(propertiesProvider
110 .docx_pp_getDocXTemplateFile());
111 try {
112 InputStream in = new FileInputStream(templateFile);
113 unzipData(in);
114 } catch (FileNotFoundException e) {
115 ConfigurationException ic = new ConfigurationException(
116 "Could not load docx template at: "
117 + propertiesProvider.docx_pp_getDocXTemplateFile());
118 ic.initCause(e);
119 throw ic;
120 }
121 }
122
123
124
125
126
127
128
129
130 private void unzipData(InputStream in) throws FileNotFoundException,
131 IOException {
132
133 String tmpDir = propertiesProvider.docx_pp_getTempDir();
134
135 String uid = UUID.randomUUID().toString();
136 directoryName = tmpDir + File.separator + uid;
137 this.directoryNameURI = new File(directoryName).toURI().toString();
138 FileUtils.unzipFile(in, new File(directoryName));
139 }
140
141
142
143
144
145
146
147
148
149
150
151 public void docXToTEI(InputStream is, OutputStream os)
152 throws SaxonApiException, IOException {
153
154 String tmpDir = propertiesProvider.docx_pp_getTempDir();
155 String tmpArchiveDirName = tmpDir + File.separator
156 + UUID.randomUUID().toString();
157 File tmpArchiveDir = new File(tmpArchiveDirName);
158 tmpArchiveDir.mkdir();
159 try{
160 XdmNode tei = getTEI(is);
161 try {
162 XMLUtils.storeDocument(tei, new File(tmpArchiveDirName
163 + File.separator + "tei.xml"));
164 } catch (IOException ex) {
165 throw ex;
166 }
167
168
169 for (String dirName : archiveDirectoriesToCopy) {
170 File dir = new File(directoryName + File.separator + dirName);
171 if (dir.exists() && dir.isDirectory()) {
172
173 if (dirName.indexOf('/') != -1
174 && !dirName.substring(0, dirName.lastIndexOf('/'))
175 .equals("")) {
176 File dirToCreate = new File(tmpArchiveDirName
177 + File.separator
178 + dirName.substring(0, dirName.lastIndexOf('/')));
179 if (!dirToCreate.isDirectory())
180 dirToCreate.mkdirs();
181 }
182
183
184 try {
185 FileUtils.copyDir(dir, new File(tmpArchiveDirName
186 + File.separator
187 + dirName.substring(dirName.lastIndexOf('/'),dirName.length())));
188 } catch (IOException e) {
189 e.printStackTrace();
190 }
191 }
192 }
193
194 zipToStream(os, tmpArchiveDir);
195 }finally{
196 EGEIOUtils.deleteDirectory(tmpArchiveDir);
197 }
198 }
199
200
201
202
203 private XdmNode getTEI(InputStream is) throws FileNotFoundException,
204 IOException, SaxonApiException {
205
206 FileUtils.unzipFile(is, new File(directoryName));
207 File dxF = new File(directoryName + File.separator + "word"
208 + File.separator + "document.xml");
209 Processor proc = SaxonProcFactory.getProcessor();
210 net.sf.saxon.s9api.DocumentBuilder builder = proc.newDocumentBuilder();
211 XdmNode doc = builder.build(dxF);
212
213 XsltCompiler comp = proc.newXsltCompiler();
214 XsltExecutable normalizerExec = comp
215 .compile(new StreamSource(new File(propertiesProvider
216 .docx_pp_getStylesheetNormalizeWordStyles())));
217 XsltExecutable docx2teiExec = comp.compile(new StreamSource(new File(
218 propertiesProvider.docx_pp_getStylesheetDocx2TEI())));
219 XsltTransformer normalizer = normalizerExec.load();
220 XsltTransformer docx2tei = docx2teiExec.load();
221
222
223 normalizer.setParameter(new QName("word-directory"),
224 new XdmAtomicValue(directoryNameURI));
225 docx2tei.setParameter(new QName("word-directory"), new XdmAtomicValue(
226 directoryNameURI));
227
228
229 normalizer.setInitialContextNode(doc);
230 XdmDestination tmpDest = new XdmDestination();
231 normalizer.setDestination(tmpDest);
232 normalizer.transform();
233
234
235 XdmDestination result = new XdmDestination();
236 docx2tei.setInitialContextNode(tmpDest.getXdmNode());
237 docx2tei.setDestination(result);
238 docx2tei.transform();
239
240 return result.getXdmNode();
241 }
242
243
244
245
246
247
248 public void mergeTEI(XdmNode teiDoc) throws SaxonApiException,
249 FileNotFoundException, IOException {
250
251 Processor proc = SaxonProcFactory.getProcessor();
252 XsltCompiler comp = proc.newXsltCompiler();
253 XsltExecutable toDocXExec = comp.compile(new StreamSource(new File(
254 propertiesProvider.docx_pp_getStylesheetTEI2Docx())));
255 XsltTransformer toDocX = toDocXExec.load();
256
257 toDocX.setParameter(new QName("word-directory"), new XdmAtomicValue(
258 directoryNameURI));
259
260
261 File wordDotXMLFile = new File(directoryName + File.separator + "word"
262 + File.separator + "document.xml");
263 Serializer result = new Serializer();
264 Writer writer = new BufferedWriter(new OutputStreamWriter(
265 new FileOutputStream(wordDotXMLFile), "UTF-8"));
266 result.setOutputWriter(writer);
267 toDocX.setInitialContextNode(teiDoc);
268 toDocX.setDestination(result);
269 toDocX.transform();
270 writer.close();
271
272 File orgCoreFile = new File(directoryName + File.separator + "docProps"
273 + File.separator + "core.xml");
274 orgCoreFile.delete();
275
276 File newCoreFile = new File(directoryName + File.separator + "docProps"
277 + File.separator + "newcore.xml");
278 newCoreFile.renameTo(orgCoreFile);
279
280 }
281
282
283
284
285
286
287
288
289 public void zipToStream(OutputStream os, File dir) throws IOException {
290 ZipOutputStream zipOs = new ZipOutputStream(
291 new BufferedOutputStream(os));
292 EGEIOUtils.constructZip(dir, zipOs, "");
293 zipOs.close();
294 }
295
296 public void cleanUp() {
297
298 EGEIOUtils.deleteDirectory(new File(directoryName));
299
300
301 if (null != zipFile && zipFile.exists())
302 zipFile.delete();
303
304
305 if (null != teiArchive && teiArchive.exists())
306 teiArchive.delete();
307
308 }
309
310 public String getDirectoryName() {
311 return directoryName;
312 }
313
314 }