View Javadoc

1   package pl.psnc.dl.ege.tei;
2   
3   import java.io.BufferedOutputStream;
4   import java.io.BufferedWriter;
5   import java.io.File;
6   import java.io.FileInputStream;
7   import java.io.FileNotFoundException;
8   import java.io.FileOutputStream;
9   import java.io.IOException;
10  import java.io.InputStream;
11  import java.io.OutputStream;
12  import java.io.OutputStreamWriter;
13  import java.io.Writer;
14  import java.util.UUID;
15  import java.util.zip.ZipOutputStream;
16  
17  import javax.xml.transform.stream.StreamSource;
18  
19  import net.sf.saxon.s9api.Processor;
20  import net.sf.saxon.s9api.QName;
21  import net.sf.saxon.s9api.SaxonApiException;
22  import net.sf.saxon.s9api.Serializer;
23  import net.sf.saxon.s9api.XdmAtomicValue;
24  import net.sf.saxon.s9api.XdmDestination;
25  import net.sf.saxon.s9api.XdmNode;
26  import net.sf.saxon.s9api.XsltCompiler;
27  import net.sf.saxon.s9api.XsltExecutable;
28  import net.sf.saxon.s9api.XsltTransformer;
29  
30  import org.tei.docx.DocXPropertiesProvider;
31  import org.tei.exceptions.ConfigurationException;
32  import org.tei.utils.FileUtils;
33  import org.tei.utils.SaxonProcFactory;
34  import org.tei.utils.XMLUtils;
35  
36  import pl.psnc.dl.ege.utils.EGEIOUtils;
37  
38  /**
39   * <p>
40   * Class responsible for DocX document transformation operations.
41   * </p>
42   * Performs transformation from DocX to TEI XML format and vice versa.
43   * 
44   * @author mariuszs
45   * 
46   */
47  class DocXConverter {
48  
49  	private String directoryName;
50  
51  	private File zipFile;
52  
53  	private File teiArchive;
54  
55  	/**
56  	 * provides all the properties that we might need.
57  	 */
58  	private DocXPropertiesProvider propertiesProvider;
59  
60  	private String directoryNameURI;
61  
62  	/**
63  	 * Defines which directories are copied to and from the archive
64  	 */
65  	private String[] archiveDirectoriesToCopy = new String[] { "word/media",
66  			"word/embeddings", "word/fonts" };
67  
68  	/**
69  	 * Constructs a docx object from the empty template
70  	 * 
71  	 * @param teiDoc
72  	 * @param propertiesProvider
73  	 * @throws ConfigurationException
74  	 * @throws IOException
75  	 */
76  	public DocXConverter(DocXPropertiesProvider propertiesProvider)
77  			throws ConfigurationException, IOException {
78  		this.propertiesProvider = propertiesProvider;
79  		initTemplate();
80  	}
81  
82  	/**
83  	 * Alternative constructor : can specify if created object is for DocX->XML
84  	 * conversion or XML->DocX.
85  	 * 
86  	 * @param pp
87  	 * @param toXml
88  	 * @throws IOException
89  	 * @throws ConfigurationException
90  	 */
91  	public DocXConverter(DocXPropertiesProvider pp, boolean toXml)
92  			throws IOException, ConfigurationException {
93  		this.propertiesProvider = pp;
94  		if (!toXml) {
95  			initTemplate();
96  		} else {
97  			String tmpDir = propertiesProvider.docx_pp_getTempDir();
98  			String uid = UUID.randomUUID().toString();
99  			directoryName = tmpDir + File.separator + uid;
100 			this.directoryNameURI = new File(directoryName).toURI().toString();
101 		}
102 	}
103 
104 	/*
105 	 * Initialization for XML->DocX transformation - unpacking template file.
106 	 */
107 	private void initTemplate() throws IOException, ConfigurationException {
108 		// copy template somewhere
109 		File templateFile = new File(propertiesProvider
110 				.docx_pp_getDocXTemplateFile());
111 		try {
112 			InputStream in = new FileInputStream(templateFile);
113 			unzipData(in);
114 		} catch (FileNotFoundException e) {
115 			ConfigurationException ic = new ConfigurationException(
116 					"Could not load docx template at: "
117 							+ propertiesProvider.docx_pp_getDocXTemplateFile());
118 			ic.initCause(e);
119 			throw ic;
120 		}
121 	}
122 
123 	/**
124 	 * Unzips the .docx file
125 	 * 
126 	 * @param in
127 	 * @throws IOException
128 	 * @throws FileNotFoundException
129 	 */
130 	private void unzipData(InputStream in) throws FileNotFoundException,
131 			IOException {
132 		// where should we unzip the file to
133 		String tmpDir = propertiesProvider.docx_pp_getTempDir();
134 		// name of the directory
135 		String uid = UUID.randomUUID().toString();
136 		directoryName = tmpDir + File.separator + uid;
137 		this.directoryNameURI = new File(directoryName).toURI().toString();
138 		FileUtils.unzipFile(in, new File(directoryName));
139 	}
140 
141 	/**
142 	 * Constructs XML TEI document from DocX version.
143 	 * 
144 	 * @param is
145 	 * @param os
146 	 * @return
147 	 * @throws SaxonApiException
148 	 * @throws IOException
149 	 * @throws FileNotFoundException
150 	 */
151 	public void docXToTEI(InputStream is, OutputStream os)
152 			throws SaxonApiException, IOException {
153 
154 		String tmpDir = propertiesProvider.docx_pp_getTempDir();
155 		String tmpArchiveDirName = tmpDir + File.separator
156 				+ UUID.randomUUID().toString();
157 		File tmpArchiveDir = new File(tmpArchiveDirName);
158 		tmpArchiveDir.mkdir();
159 		try{
160 		XdmNode tei = getTEI(is);
161 		try {
162 			XMLUtils.storeDocument(tei, new File(tmpArchiveDirName
163 					+ File.separator + "tei.xml"));
164 		} catch (IOException ex) {
165 			throw ex;
166 		}
167 
168 		// copy directories
169 		for (String dirName : archiveDirectoriesToCopy) {
170 			File dir = new File(directoryName + File.separator + dirName);
171 			if (dir.exists() && dir.isDirectory()) {
172 				// try to create necessary directories
173 				if (dirName.indexOf('/') != -1
174 						&& !dirName.substring(0, dirName.lastIndexOf('/'))
175 								.equals("")) {
176 					File dirToCreate = new File(tmpArchiveDirName
177 							+ File.separator
178 							+ dirName.substring(0, dirName.lastIndexOf('/')));
179 					if (!dirToCreate.isDirectory())
180 						dirToCreate.mkdirs();
181 				}
182 
183 				// copy directory
184 				try {
185 					FileUtils.copyDir(dir, new File(tmpArchiveDirName
186 							+ File.separator
187 							+ dirName.substring(dirName.lastIndexOf('/'),dirName.length())));
188 				} catch (IOException e) {
189 					e.printStackTrace();
190 				}
191 			}
192 		}
193 		// pack tmp dir to zip and send it to output stream.
194 		zipToStream(os, tmpArchiveDir);
195 		}finally{
196 			EGEIOUtils.deleteDirectory(tmpArchiveDir);
197 		}
198 	}
199 
200 	/*
201 	 * Gets xml TEI document from docx
202 	 */
203 	private XdmNode getTEI(InputStream is) throws FileNotFoundException,
204 			IOException, SaxonApiException {
205 
206 		FileUtils.unzipFile(is, new File(directoryName));
207 		File dxF = new File(directoryName + File.separator + "word"
208 				+ File.separator + "document.xml");
209 		Processor proc = SaxonProcFactory.getProcessor();
210 		net.sf.saxon.s9api.DocumentBuilder builder = proc.newDocumentBuilder();
211 		XdmNode doc = builder.build(dxF);
212 
213 		XsltCompiler comp = proc.newXsltCompiler();
214 		XsltExecutable normalizerExec = comp
215 				.compile(new StreamSource(new File(propertiesProvider
216 						.docx_pp_getStylesheetNormalizeWordStyles())));
217 		XsltExecutable docx2teiExec = comp.compile(new StreamSource(new File(
218 				propertiesProvider.docx_pp_getStylesheetDocx2TEI())));
219 		XsltTransformer normalizer = normalizerExec.load();
220 		XsltTransformer docx2tei = docx2teiExec.load();
221 
222 		// set directory
223 		normalizer.setParameter(new QName("word-directory"),
224 				new XdmAtomicValue(directoryNameURI));
225 		docx2tei.setParameter(new QName("word-directory"), new XdmAtomicValue(
226 				directoryNameURI));
227 
228 		// transform part1
229 		normalizer.setInitialContextNode(doc);
230 		XdmDestination tmpDest = new XdmDestination();
231 		normalizer.setDestination(tmpDest);
232 		normalizer.transform();
233 
234 		// transform part2
235 		XdmDestination result = new XdmDestination();
236 		docx2tei.setInitialContextNode(tmpDest.getXdmNode());
237 		docx2tei.setDestination(result);
238 		docx2tei.transform();
239 
240 		return result.getXdmNode();
241 	}
242 
243 	/**
244 	 * Merges a TEI document back into a docx
245 	 * 
246 	 * @param teiDoc
247 	 */
248 	public void mergeTEI(XdmNode teiDoc) throws SaxonApiException,
249 			FileNotFoundException, IOException {
250 		// prepare transformation
251 		Processor proc = SaxonProcFactory.getProcessor();
252 		XsltCompiler comp = proc.newXsltCompiler();
253 		XsltExecutable toDocXExec = comp.compile(new StreamSource(new File(
254 				propertiesProvider.docx_pp_getStylesheetTEI2Docx())));
255 		XsltTransformer toDocX = toDocXExec.load();
256 
257 		toDocX.setParameter(new QName("word-directory"), new XdmAtomicValue(
258 				directoryNameURI));
259 
260 		// transform and write back to document.xml
261 		File wordDotXMLFile = new File(directoryName + File.separator + "word"
262 				+ File.separator + "document.xml");
263 		Serializer result = new Serializer();
264 		Writer writer = new BufferedWriter(new OutputStreamWriter(
265 				new FileOutputStream(wordDotXMLFile), "UTF-8"));
266 		result.setOutputWriter(writer);
267 		toDocX.setInitialContextNode(teiDoc);
268 		toDocX.setDestination(result);
269 		toDocX.transform();
270 		writer.close();
271 		// remove original core.xml file
272 		File orgCoreFile = new File(directoryName + File.separator + "docProps"
273 				+ File.separator + "core.xml");
274 		orgCoreFile.delete();
275 		// move new core.xml
276 		File newCoreFile = new File(directoryName + File.separator + "docProps"
277 				+ File.separator + "newcore.xml");
278 		newCoreFile.renameTo(orgCoreFile);
279 
280 	}
281 
282 	/**
283 	 * Packs selected directory into streamed zip archive.
284 	 * 
285 	 * @param os
286 	 * @param dir
287 	 * @throws IOException
288 	 */
289 	public void zipToStream(OutputStream os, File dir) throws IOException {
290 		ZipOutputStream zipOs = new ZipOutputStream(
291 				new BufferedOutputStream(os));
292 		EGEIOUtils.constructZip(dir, zipOs, "");
293 		zipOs.close();
294 	}
295 
296 	public void cleanUp() {
297 		// delete temporary dir
298 		EGEIOUtils.deleteDirectory(new File(directoryName));
299 
300 		// delete zip file
301 		if (null != zipFile && zipFile.exists())
302 			zipFile.delete();
303 
304 		// delete archive
305 		if (null != teiArchive && teiArchive.exists())
306 			teiArchive.delete();
307 
308 	}
309 
310 	public String getDirectoryName() {
311 		return directoryName;
312 	}
313 
314 }