package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;
import java.util.SortedMap;
import java.util.regex.PatternSyntaxException;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.symbol.SymbolTable;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.SyntaxGraphException;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;

/**
 * Streaming (StAX) reader for TigerXML input.
 *
 * <p>Each call to {@link #readSentence(TokenStructure)} consumes XML events up to the end of the
 * next <code>s</code> element and fills the supplied phrase structure graph with the terminals
 * (<code>t</code>), non-terminals (<code>nt</code>) and edges (<code>edge</code>) found on the way.
 * The TigerXML header (<code>head</code>, <code>feature</code>, <code>value</code>, ...) is skipped.
 *
 * <p>Node identifiers are expected to be either a plain integer or a prefix followed by
 * <code>'_'</code> and an integer (only the part after the first <code>'_'</code> is parsed).
 * Identifiers smaller than the configured non-terminal start id (default 500, option
 * <code>-s</code>) refer to terminals, all others to non-terminals.
 *
 * @author Johan Hall
 */
public class TigerXMLReader implements SyntaxGraphReader {
	private XMLStreamReader reader;
	/**
	 * The character stream the XML reader was created on. It is kept because
	 * {@link XMLStreamReader#close()} does not close the underlying input source.
	 */
	private BufferedReader input;
	private int sentenceCount;
	private DataFormatInstance dataFormatInstance;
	/** Value of the <code>root</code> attribute of the most recently seen <code>graph</code> element. */
	private String graphRootId = "";
	private String optionString;
	private String fileName = null;
	private URL url = null;
	private String charsetName;
	private int nIterations;
	private int cIterations;
	/** First identifier that denotes a non-terminal; can be changed with the <code>-s</code> option. */
	private int startIdOfNonterminals = 500;
	/** <code>false</code> while reading from <code>System.in</code>, which must never be closed by this reader. */
	private boolean closeStream = true;

	/**
	 * Creates a reader that passes over its input exactly once.
	 */
	public TigerXMLReader() {
		nIterations = 1;
		cIterations = 1;
	}

	/**
	 * Closes the current input and opens the remembered file or URL again.
	 *
	 * @throws MaltChainedException if the reader was opened on a plain input stream (which cannot
	 *         be rewound) or if reopening fails
	 */
	private void reopen() throws MaltChainedException {
		close();
		if (fileName != null) {
			open(fileName, charsetName);
		} else if (url != null) {
			open(url, charsetName);
		} else {
			throw new DataFormatException("The input stream cannot be reopened. ");
		}
	}

	/**
	 * Opens a TigerXML file.
	 *
	 * @param fileName the path of the file to read
	 * @param charsetName the name of the character encoding of the file
	 * @throws MaltChainedException if the file cannot be found or opened
	 */
	public void open(String fileName, String charsetName) throws MaltChainedException {
		setFileName(fileName);
		setCharsetName(charsetName);
		try {
			open(new FileInputStream(fileName), charsetName);
		} catch (FileNotFoundException e) {
			throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
		}
	}

	/**
	 * Opens a TigerXML resource addressed by a URL.
	 *
	 * @param url the location to read from
	 * @param charsetName the name of the character encoding of the resource
	 * @throws MaltChainedException if the URL cannot be opened
	 */
	public void open(URL url, String charsetName) throws MaltChainedException {
		setUrl(url);
		setCharsetName(charsetName);
		try {
			open(url.openStream(), charsetName);
		} catch (IOException e) {
			throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
		}
	}

	/**
	 * Opens a TigerXML input stream. A stream other than <code>System.in</code> is closed by
	 * {@link #close()}.
	 *
	 * @param is the stream to read from
	 * @param charsetName the name of the character encoding of the stream
	 * @throws MaltChainedException if the encoding is not supported or the XML reader cannot be created
	 */
	public void open(InputStream is, String charsetName) throws MaltChainedException {
		// Decide per stream; otherwise one use of System.in would disable closing for good.
		closeStream = (is != System.in);
		try {
			open(new InputStreamReader(is, charsetName));
		} catch (UnsupportedEncodingException e) {
			if (closeStream) {
				closeQuietly(is);
			}
			throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
		}
	}

	/**
	 * Creates the XML stream reader on top of the supplied character stream and resets the
	 * sentence counter.
	 *
	 * @param isr the character stream to parse
	 * @throws MaltChainedException if the XML stream reader cannot be created
	 */
	private void open(InputStreamReader isr) throws MaltChainedException {
		final BufferedReader buffered = new BufferedReader(isr);
		try {
			final XMLInputFactory factory = XMLInputFactory.newInstance();
			// Treebanks may be fetched from arbitrary URLs: do not resolve external entities (XXE).
			factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
			setReader(factory.createXMLStreamReader(buffered));
			input = buffered;
		} catch (XMLStreamException e) {
			if (closeStream) {
				closeQuietly(buffered);
			}
			throw new DataFormatException("XML input file could not be opened. ", e);
		}
		setSentenceCount(0);
	}

	/**
	 * Closes a resource while another, more relevant error is being reported.
	 *
	 * @param resource the resource to close
	 */
	private static void closeQuietly(Closeable resource) {
		try {
			resource.close();
		} catch (IOException ignored) {
			// Best effort only: the caller is about to throw the exception that caused the clean-up.
		}
	}

	/**
	 * Does nothing; the TigerXML header is skipped while reading the first sentence.
	 */
	public void readProlog() throws MaltChainedException {

	}

	/**
	 * Reads the next sentence into the supplied graph, which is cleared first.
	 *
	 * @param syntaxGraph the graph to fill; must be a {@link PhraseStructure}
	 * @return <code>true</code> if a sentence was read, or if the end of the input was reached and
	 *         the input was reopened for another iteration; <code>false</code> if
	 *         <code>syntaxGraph</code> is <code>null</code> or not a phrase structure, or if the end
	 *         of the input was reached in the last iteration
	 * @throws MaltChainedException if the reader has not been opened, the XML cannot be parsed or
	 *         a node identifier cannot be interpreted
	 */
	public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
		if (!(syntaxGraph instanceof PhraseStructure)) { // also covers null
			return false;
		}
		if (reader == null) {
			throw new DataFormatException("The TigerXML reader has not been opened. ");
		}
		syntaxGraph.clear();
		final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
		// The non-terminal whose <edge> children are currently being read.
		PhraseStructureNode parent = null;

		try {
			while (true) {
				final int event = reader.next();
				if (event == XMLStreamConstants.START_ELEMENT) {
					final String element = reader.getLocalName();
					if ("edge".equals(element)) {
						addEdge(phraseStructure, parent);
					} else if ("nt".equals(element)) {
						parent = startNonTerminal(phraseStructure);
					} else if ("t".equals(element)) {
						addTerminal(syntaxGraph);
					} else if ("s".equals(element)) {
						startSentence(phraseStructure);
					} else if ("graph".equals(element)) {
						final String root = reader.getAttributeValue(null, "root");
						graphRootId = (root == null) ? "" : root;
					}
					// All other elements (header, secedge, ...) are ignored.
				} else if (event == XMLStreamConstants.END_ELEMENT) {
					final String element = reader.getLocalName();
					if ("nonterminals".equals(element)) {
						attachSingleToken(phraseStructure);
					} else if ("s".equals(element)) {
						if (syntaxGraph.hasTokens()) {
							sentenceCount++;
						}
						if (syntaxGraph instanceof MappablePhraseStructureGraph) {
							final MappablePhraseStructureGraph mappable = (MappablePhraseStructureGraph)syntaxGraph;
							mappable.getMapping().updateDependenyGraph(mappable, phraseStructure.getPhraseStructureRoot());
						}
						return true;
					}
				} else if (event == XMLStreamConstants.END_DOCUMENT) {
					if (syntaxGraph.hasTokens()) {
						sentenceCount++;
					}
					if (cIterations < nIterations) {
						cIterations++;
						reopen();
						return true;
					}
					return false;
				}
			}
		} catch (XMLStreamException e) {
			throw new DataFormatException("The TigerXML input could not be parsed. ", e);
		}
	}

	/**
	 * Parses the numeric part of a TigerXML node identifier: the text after the first
	 * <code>'_'</code>, or the whole identifier if it contains no <code>'_'</code>.
	 *
	 * @param ref the attribute value, possibly <code>null</code>
	 * @param attribute the attribute name, used in error messages
	 * @param element the element name, used in error messages
	 * @return the parsed number
	 * @throws MaltChainedException if the attribute is missing or not numeric
	 */
	private int parseNodeId(String ref, String attribute, String element) throws MaltChainedException {
		if (ref == null) {
			throw new SyntaxGraphException("The tiger reader couldn't find the "+attribute+" attribute of the "+element+" element. ");
		}
		final int separator = ref.indexOf('_');
		try {
			return Integer.parseInt(separator == -1 ? ref : ref.substring(separator + 1));
		} catch (NumberFormatException e) {
			throw new SyntaxGraphException("The tiger reader couldn't recognize the "+attribute+" attribute '"+ref+"' of the "+element+" element. ");
		}
	}

	/**
	 * Copies the attributes of the current element to a node, one label per symbol table. The
	 * attribute name is the lower-cased table name; a missing attribute yields a <code>null</code> value.
	 */
	private void addNodeLabels(PhraseStructureNode node, SortedMap<String, SymbolTable> tables) throws MaltChainedException {
		for (String name : tables.keySet()) {
			// Locale.ROOT: attribute names must not depend on the default locale (e.g. Turkish 'I').
			node.addLabel(tables.get(name), reader.getAttributeValue(null, name.toLowerCase(Locale.ROOT)));
		}
	}

	/**
	 * Handles the start of an <code>edge</code> element: links the referenced child to the
	 * current parent and labels the new edge.
	 */
	private void addEdge(PhraseStructure phraseStructure, PhraseStructureNode parent) throws MaltChainedException {
		final String idref = reader.getAttributeValue(null, "idref");
		final int childId = parseNodeId(idref, "idref", "edge");
		if (childId == -1) {
			throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+idref+"' of the edge element. ");
		}

		final PhraseStructureNode child;
		if (childId < startIdOfNonterminals) {
			child = phraseStructure.getTokenNode(childId);
		} else {
			child = phraseStructure.getNonTerminalNode(childId - startIdOfNonterminals + 1);
		}

		final Edge edge = phraseStructure.addPhraseStructureEdge(parent, child);
		final SortedMap<String, SymbolTable> edgeTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
		for (String name : edgeTables.keySet()) {
			edge.addLabel(edgeTables.get(name), reader.getAttributeValue(null, name.toLowerCase(Locale.ROOT)));
		}
	}

	/**
	 * Handles the start of an <code>nt</code> element.
	 *
	 * @return the phrase structure root if the element's id equals the graph's <code>root</code>
	 *         attribute, otherwise a newly added non-terminal node
	 */
	private PhraseStructureNode startNonTerminal(PhraseStructure phraseStructure) throws MaltChainedException {
		final String id = reader.getAttributeValue(null, "id");
		final PhraseStructureNode node;
		if (id != null && id.equals(graphRootId)) {
			node = phraseStructure.getPhraseStructureRoot();
		} else {
			node = phraseStructure.addNonTerminalNode(parseNodeId(id, "id", "nt") - startIdOfNonterminals + 1);
		}
		addNodeLabels(node, dataFormatInstance.getPhraseStructureNodeLabelSymbolTables());
		return node;
	}

	/**
	 * Handles the start of a <code>t</code> element: appends a token node and labels it.
	 */
	private void addTerminal(TokenStructure syntaxGraph) throws MaltChainedException {
		final SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables();
		final PhraseStructureNode token = syntaxGraph.addTokenNode();
		addNodeLabels(token, inputTables);
	}

	/**
	 * Handles the start of an <code>s</code> element: sets the sentence id to the number that
	 * starts at the first digit of the <code>id</code> attribute, or to the running sentence number
	 * if the attribute is missing, contains no digit or is not numeric from the first digit on.
	 */
	private void startSentence(PhraseStructure phraseStructure) throws MaltChainedException {
		final String id = reader.getAttributeValue(null, "id");
		int sentenceId = sentenceCount + 1;
		if (id != null) {
			int firstDigit = -1;
			for (int i = 0, n = id.length(); i < n; i++) {
				if (Character.isDigit(id.charAt(i))) {
					firstDigit = i;
					break;
				}
			}
			if (firstDigit != -1) {
				try {
					sentenceId = Integer.parseInt(id.substring(firstDigit));
				} catch (NumberFormatException e) {
					// e.g. "s12b" or an overflowing number: keep the running sentence number.
				}
			}
		}
		phraseStructure.setSentenceID(sentenceId);
	}

	/**
	 * Handles the end of a <code>nonterminals</code> element: a sentence that consists of one
	 * token and no non-terminals gets an edge from the root to that token, labeled "--".
	 */
	private void attachSingleToken(PhraseStructure phraseStructure) throws MaltChainedException {
		if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0
				&& ((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
			final Edge edge = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
			final SortedMap<String, SymbolTable> edgeTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
			for (String name : edgeTables.keySet()) {
				edge.addLabel(edgeTables.get(name), "--");
			}
		}
	}

	/**
	 * @return the number of non-empty sentences read since the input was (re)opened
	 */
	public int getSentenceCount() {
		return sentenceCount;
	}

	public void setSentenceCount(int sentenceCount) {
		this.sentenceCount = sentenceCount;
	}

	/**
	 * @return the underlying XML stream reader, or <code>null</code> if the reader is not open
	 */
	public XMLStreamReader getReader() {
		return reader;
	}

	public void setReader(XMLStreamReader reader) {
		this.reader = reader;
	}

	/**
	 * Does nothing; TigerXML needs no epilog handling.
	 */
	public void readEpilog() throws MaltChainedException {

	}

	/**
	 * Releases the XML reader and closes the underlying input, unless the input is
	 * <code>System.in</code>. Calling this method on a reader that is not open has no effect.
	 *
	 * @throws MaltChainedException if the XML reader or the underlying input cannot be closed
	 */
	public void close() throws MaltChainedException {
		try {
			if (closeStream) {
				try {
					if (reader != null) {
						reader.close();
					}
				} finally {
					// XMLStreamReader.close() leaves the underlying source open, so close it here.
					if (input != null) {
						input.close();
					}
				}
			}
		} catch (XMLStreamException e) {
			throw new DataFormatException("The XML input file could not be closed. ", e);
		} catch (IOException e) {
			throw new DataFormatException("The XML input file could not be closed. ", e);
		} finally {
			reader = null;
			input = null;
		}
	}

	public DataFormatInstance getDataFormatInstance() {
		return dataFormatInstance;
	}

	public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
		this.dataFormatInstance = inputDataFormatInstance;
	}

	public String getOptions() {
		return optionString;
	}

	/**
	 * Parses the reader options: a sequence of <code>-flag value</code> pairs separated by blanks
	 * or underscores. Empty tokens produced by consecutive separators are ignored. The only
	 * supported flag is <code>-s</code>, the first identifier used for non-terminals.
	 *
	 * @param optionString the options; <code>null</code> or an empty string leaves the settings unchanged
	 * @throws MaltChainedException if a flag does not start with '-', has no value, is unknown or
	 *         has a non-integer value
	 */
	public void setOptions(String optionString) throws MaltChainedException {
		this.optionString = optionString;
		if (optionString == null) {
			return;
		}
		final String[] argv;
		try {
			argv = optionString.split("[_\\p{Blank}]");
		} catch (PatternSyntaxException e) {
			throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
		}
		int i = nextToken(argv, 0);
		while (i < argv.length) {
			final String flag = argv[i];
			if (flag.charAt(0) != '-') {
				throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
			}
			final int valueIndex = nextToken(argv, i + 1);
			if (valueIndex >= argv.length) {
				throw new DataFormatException("The last argument does not have any value. ");
			}
			final String value = argv[valueIndex];
			if (flag.length() == 2 && flag.charAt(1) == 's') {
				try {
					startIdOfNonterminals = Integer.parseInt(value);
				} catch (NumberFormatException e) {
					throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
				}
			} else if (flag.length() > 1 && flag.charAt(1) == 's') {
				// Only the character after '-' selects the option, so e.g. "-start" is accepted as "-s".
				try {
					startIdOfNonterminals = Integer.parseInt(value);
				} catch (NumberFormatException e) {
					throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
				}
			} else {
				throw new DataFormatException("Unknown TigerXMLReader parameter: '"+flag+"' with value '"+value+"'. ");
			}
			i = nextToken(argv, valueIndex + 1);
		}
	}

	/**
	 * @return the index of the first non-empty token at or after <code>from</code>, or
	 *         <code>tokens.length</code> if there is none
	 */
	private static int nextToken(String[] tokens, int from) {
		int i = from;
		while (i < tokens.length && tokens[i].length() == 0) {
			i++;
		}
		return i;
	}

	public String getFileName() {
		return fileName;
	}

	public void setFileName(String fileName) {
		this.fileName = fileName;
	}

	public URL getUrl() {
		return url;
	}

	public void setUrl(URL url) {
		this.url = url;
	}

	public String getCharsetName() {
		return charsetName;
	}

	public void setCharsetName(String charsetName) {
		this.charsetName = charsetName;
	}

	/**
	 * @return the number of passes over the input that will be made
	 */
	public int getNIterations() {
		return nIterations;
	}

	public void setNIterations(int iterations) {
		nIterations = iterations;
	}

	/**
	 * @return the number of the current pass over the input, starting at 1
	 */
	public int getIterationCounter() {
		return cIterations;
	}
}