001 package org.maltparser.core.syntaxgraph.writer; 002 003 import java.io.BufferedWriter; 004 import java.io.FileNotFoundException; 005 import java.io.FileOutputStream; 006 import java.io.IOException; 007 import java.io.OutputStream; 008 import java.io.OutputStreamWriter; 009 import java.io.UnsupportedEncodingException; 010 import java.util.SortedMap; 011 import java.util.TreeMap; 012 import java.util.regex.PatternSyntaxException; 013 014 import org.maltparser.core.exception.MaltChainedException; 015 016 import org.maltparser.core.helper.Util; 017 import org.maltparser.core.io.dataformat.ColumnDescription; 018 import org.maltparser.core.io.dataformat.DataFormatException; 019 import org.maltparser.core.io.dataformat.DataFormatInstance; 020 import org.maltparser.core.syntaxgraph.PhraseStructure; 021 import org.maltparser.core.syntaxgraph.TokenStructure; 022 import org.maltparser.core.syntaxgraph.node.NonTerminalNode; 023 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 024 import org.maltparser.core.syntaxgraph.node.TokenNode; 025 import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader; 026 import org.maltparser.ml.libsvm.LibsvmException; 027 /** 028 * 029 * 030 * @author Johan Hall 031 */ 032 public class TigerXMLWriter implements SyntaxGraphWriter { 033 private enum RootHandling { 034 TALBANKEN, NORMAL 035 }; 036 037 private BufferedWriter writer; 038 private DataFormatInstance dataFormatInstance; 039 private String optionString; 040 private int sentenceCount; 041 private TigerXMLHeader header; 042 // private boolean hasWriteTigerXMLHeader = false; 043 private RootHandling rootHandling; 044 private String sentencePrefix = "s"; 045 private StringBuilder sentenceID; 046 private StringBuilder tmpID; 047 private StringBuilder rootID; 048 private int START_ID_OF_NONTERMINALS = 500; 049 private boolean labeledTerminalID; 050 private String VROOT_SYMBOL = "VROOT"; 051 private boolean useVROOT = false; 052 // private String fileName = null; 053 // private String charsetName = null; 054 private boolean closeStream = true; 055 056 public TigerXMLWriter() { 057 sentenceID = new StringBuilder(); 058 tmpID = new StringBuilder(); 059 rootID = new StringBuilder(); 060 labeledTerminalID = false; 061 } 062 063 public void open(String fileName, String charsetName) throws MaltChainedException { 064 try { 065 // this.fileName = fileName; 066 // this.charsetName = charsetName; 067 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName)); 068 } catch (FileNotFoundException e) { 069 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e); 070 } catch (UnsupportedEncodingException e) { 071 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 072 } 073 } 074 075 public void open(OutputStream os, String charsetName) throws MaltChainedException { 076 try { 077 if (os == System.out || os == System.err) { 078 closeStream = false; 079 } 080 open(new OutputStreamWriter(os, charsetName)); 081 } catch (UnsupportedEncodingException e) { 082 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 083 } 084 } 085 086 private void open(OutputStreamWriter osw) throws MaltChainedException { 087 setWriter(new BufferedWriter(osw)); 088 setSentenceCount(0); 089 } 090 091 public void writeProlog() throws MaltChainedException { 092 // if (fileName == null || charsetName == null) { 093 writeHeader(); 094 // } 095 } 096 097 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException { 098 if (syntaxGraph == null || dataFormatInstance == null) { 099 return; 100 } 101 if (syntaxGraph.hasTokens()) { 102 sentenceCount++; 103 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph; 104 try { 105 sentenceID.setLength(0); 106 sentenceID.append(sentencePrefix); 107 if (phraseStructure.getSentenceID() != 0) { 108 sentenceID.append(Integer.toString(phraseStructure.getSentenceID())); 109 } else { 110 sentenceID.append(Integer.toString(sentenceCount)); 111 } 112 writer.write(" <s id=\""); 113 writer.write(sentenceID.toString()); 114 writer.write("\">\n"); 115 116 setRootID(phraseStructure); 117 writer.write(" <graph root=\""); 118 writer.write(rootID.toString()); 119 writer.write("\" "); 120 writer.write("discontinuous=\""); 121 writer.write(Boolean.toString(!phraseStructure.isContinuous())); 122 writer.write("\">\n"); 123 124 writeTerminals(phraseStructure); 125 if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) { 126 writeNonTerminals(phraseStructure); 127 } else { 128 writer.write(" <nonterminals/>\n"); 129 } 130 writer.write(" </graph>\n"); 131 writer.write(" </s>\n"); 132 } catch (IOException e) { 133 throw new DataFormatException("The TigerXML writer could not write to file. ", e); 134 } 135 } 136 } 137 138 private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException { 139 useVROOT = false; 140 PhraseStructureNode root = phraseStructure.getPhraseStructureRoot(); 141 for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) { 142 if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(VROOT_SYMBOL)) { 143 useVROOT = true; 144 break; 145 } 146 } 147 if (useVROOT) { 148 rootID.setLength(0); 149 rootID.append(sentenceID); 150 rootID.append('_'); 151 rootID.append(VROOT_SYMBOL); 152 } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) { 153 rootID.setLength(0); 154 rootID.append(sentenceID); 155 rootID.append("_1"); 156 } else { 157 rootID.setLength(0); 158 rootID.append(sentenceID); 159 rootID.append('_'); 160 // if (rootHandling.equals(RootHandling.NORMAL)) { 161 rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals())); 162 // } else if (rootHandling.equals(RootHandling.TALBANKEN)) { 163 // rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1)); 164 // } 165 } 166 167 } 168 169 public void writeEpilog() throws MaltChainedException { 170 writeTail(); 171 } 172 173 public BufferedWriter getWriter() { 174 return writer; 175 } 176 177 public void setWriter(BufferedWriter writer) { 178 this.writer = writer; 179 } 180 181 public void close() throws MaltChainedException { 182 try { 183 if (writer != null) { 184 writer.flush(); 185 if (closeStream) { 186 writer.close(); 187 } 188 writer = null; 189 } 190 } catch (IOException e) { 191 throw new DataFormatException("Could not close the output file. ", e); 192 } 193 } 194 195 private void writeHeader() throws MaltChainedException { 196 try { 197 if (header == null) { 198 header = new TigerXMLHeader(dataFormatInstance.getSymbolTables()); 199 } 200 writer.write(header.toTigerXML()); 201 // hasWriteTigerXMLHeader = true; 202 } catch (IOException e) { 203 throw new DataFormatException("The TigerXML writer could not write to file. ", e); 204 } 205 } 206 207 208 private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException { 209 try { 210 writer.write(" <terminals>\n"); 211 for (int index : phraseStructure.getTokenIndices()) { 212 final PhraseStructureNode t = phraseStructure.getTokenNode(index); 213 writer.write(" <t "); 214 if (!labeledTerminalID) { 215 tmpID.setLength(0); 216 tmpID.append(sentenceID); 217 tmpID.append('_'); 218 tmpID.append(Integer.toString(t.getIndex())); 219 writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" "); 220 } 221 222 for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) { 223 writer.write(column.getName().toLowerCase()); 224 writer.write("=\""); 225 writer.write(Util.xmlEscape(t.getLabelSymbol(column.getSymbolTable()))); 226 writer.write("\" "); 227 } 228 writer.write("/>\n"); 229 } 230 writer.write(" </terminals>\n"); 231 } catch (IOException e) { 232 throw new DataFormatException("The TigerXML writer is not able to write. ", e); 233 } 234 } 235 236 public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException { 237 try { 238 SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>(); 239 for (int index : phraseStructure.getNonTerminalIndices()) { 240 heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight()); 241 } 242 writer.write(" <nonterminals>\n"); 243 boolean done = false; 244 int h = 1; 245 while (!done) { 246 done = true; 247 for (int index : phraseStructure.getNonTerminalIndices()) { 248 if (heights.get(index) == h) { 249 NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index); 250 tmpID.setLength(0); 251 tmpID.append(sentenceID); 252 tmpID.append('_'); 253 tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1)); 254 writeNonTerminal(nt, tmpID.toString()); 255 done = false; 256 } 257 } 258 h++; 259 } 260 261 writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString()); 262 writer.write(" </nonterminals>\n"); 263 } catch (IOException e) { 264 throw new DataFormatException("The TigerXML writer is not able to write. ", e); 265 } 266 } 267 268 public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException { 269 try { 270 writer.write(" <nt"); 271 writer.write(" id=\"");writer.write(id);writer.write("\" "); 272 for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) { 273 if (nt.hasLabel(column.getSymbolTable())) { 274 writer.write(column.getName().toLowerCase()); 275 writer.write("="); 276 writer.write("\""); 277 writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable()))); 278 writer.write("\" "); 279 } 280 } 281 writer.write(">\n"); 282 283 for (int i = 0, n = nt.nChildren(); i < n; i++) { 284 PhraseStructureNode child = nt.getChild(i); 285 writer.write(" <edge "); 286 287 for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) { 288 if (child.hasParentEdgeLabel(column.getSymbolTable())) { 289 writer.write(column.getName().toLowerCase()); 290 writer.write("=\""); 291 writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable()))); 292 writer.write("\" "); 293 } 294 } 295 if (child instanceof TokenNode) { 296 if (!labeledTerminalID) { 297 tmpID.setLength(0); 298 tmpID.append(sentenceID); 299 tmpID.append('_'); 300 tmpID.append(Integer.toString(child.getIndex())); 301 writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\""); 302 } else { 303 writer.write(" idref=\"");writer.write(child.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get("ID")));writer.write("\""); 304 } 305 306 } else { 307 tmpID.setLength(0); 308 tmpID.append(sentenceID); 309 tmpID.append('_'); 310 tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1)); 311 writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\""); 312 } 313 writer.write(" />\n"); 314 } 315 writer.write(" </nt>\n"); 316 } catch (IOException e) { 317 throw new DataFormatException("The TigerXML writer is not able to write. ", e); 318 } 319 } 320 321 322 private void writeTail() throws MaltChainedException { 323 try { 324 writer.write(" </body>\n"); 325 writer.write("</corpus>\n"); 326 writer.flush(); 327 // if (fileName != null && charsetName != null) { 328 // writer.close(); 329 // writer = null; 330 // BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName)); 331 // if (header == null) { 332 // header = new TigerXMLHeader(dataFormatInstance.getSymbolTables()); 333 // } 334 // 335 // headerWriter.write(header.toTigerXML()); 336 // headerWriter.flush(); 337 // headerWriter.close(); 338 // } 339 } catch (IOException e) { 340 throw new DataFormatException("The TigerXML writer is not able to write. ", e); 341 } 342 } 343 344 public int getSentenceCount() { 345 return sentenceCount; 346 } 347 348 public void setSentenceCount(int sentenceCount) { 349 this.sentenceCount = sentenceCount; 350 } 351 352 public DataFormatInstance getDataFormatInstance() { 353 return dataFormatInstance; 354 } 355 356 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 357 this.dataFormatInstance = dataFormatInstance; 358 labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID")); 359 } 360 361 public String getOptions() { 362 return optionString; 363 } 364 365 public void setOptions(String optionString) throws MaltChainedException { 366 this.optionString = optionString; 367 rootHandling = RootHandling.NORMAL; 368 369 String[] argv; 370 try { 371 argv = optionString.split("[_\\p{Blank}]"); 372 } catch (PatternSyntaxException e) { 373 throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e); 374 } 375 for (int i=0; i < argv.length-1; i++) { 376 if(argv[i].charAt(0) != '-') { 377 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 378 } 379 if(++i>=argv.length) { 380 throw new DataFormatException("The last argument does not have any value. "); 381 } 382 switch(argv[i-1].charAt(1)) { 383 case 'r': 384 if (argv[i].equals("n")) { 385 rootHandling = RootHandling.NORMAL; 386 } else if (argv[i].equals("tal")) { 387 rootHandling = RootHandling.TALBANKEN; 388 } 389 break; 390 case 's': 391 try { 392 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]); 393 } catch (NumberFormatException e){ 394 throw new MaltChainedException("The TigerXML writer option -s must be an integer value. "); 395 } 396 break; 397 case 'v': 398 VROOT_SYMBOL = argv[i]; 399 break; 400 default: 401 throw new LibsvmException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 402 } 403 } 404 } 405 }