package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.SortedMap;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
import org.maltparser.core.syntaxgraph.node.TokenNode;

/**
 * Reads sentences written as bracketed trees, one balanced
 * <code>( ... )</code> group per sentence, and builds the corresponding
 * nodes, edges and labels in a {@link PhraseStructure}.
 *
 * <p>A bracket group that contains a nested opening bracket becomes a
 * nonterminal node; a group without one becomes a terminal (token) node.
 * Brackets, blanks and a few other characters can be escaped with a
 * backslash so that they may occur inside labels.</p>
 *
 * @author Johan Hall
 */
public class BracketReader implements SyntaxGraphReader {
    private static final char STARTING_BRACKET = '(';
    private static final char CLOSING_BRACKET = ')';
    private static final char INPUT_SEPARATOR = ' ';
    private static final char EDGELABEL_SEPARATOR = '-';
    private static final char SENTENCE_SEPARATOR = '\n';
    private static final char BLANK = ' ';
    private static final char CARRIAGE_RETURN = '\r';
    private static final char TAB = '\t';

    private BufferedReader reader;
    private DataFormatInstance dataFormatInstance;
    private int sentenceCount;
    /** Normalized text of the sentence currently being read. */
    private final StringBuilder input;
    private int terminalCounter;
    private int nonTerminalCounter;
    private String optionString;
    private SortedMap<String,ColumnDescription> inputColumns;
    private SortedMap<String,ColumnDescription> edgeLabelColumns;
    private SortedMap<String,ColumnDescription> phraseLabelColumns;

    private String fileName = null;
    private URL url = null;
    private String charsetName;
    /** Number of passes over the input that were requested. */
    private int nIterations;
    /** Number of the pass currently being read (starts at 1). */
    private int cIterations;
    /** Whether {@link #close()} closes the underlying stream; false for System.in. */
    private boolean closeStream = true;

    /**
     * Creates a reader that makes a single pass over its input.
     */
    public BracketReader() {
        input = new StringBuilder();
        nIterations = 1;
        cIterations = 1;
    }

    /**
     * Closes the current input and opens it again from the remembered file
     * name or URL.
     *
     * @throws MaltChainedException if neither a file name nor a URL is known,
     *         or if closing/opening fails
     */
    private void reopen() throws MaltChainedException {
        close();
        if (fileName != null) {
            open(fileName, charsetName);
        } else if (url != null) {
            open(url, charsetName);
        } else {
            throw new DataFormatException("The input stream cannot be reopen. ");
        }
    }

    /**
     * Opens the named file for reading and remembers the file name and
     * character set.
     *
     * @param fileName path of the input file
     * @param charsetName name of the character encoding of the file
     * @throws MaltChainedException if the file cannot be found or the
     *         encoding is not supported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        setFileName(fileName);
        setCharsetName(charsetName);
        InputStream stream;
        try {
            stream = new FileInputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
        }
        openOwnedStream(stream, charsetName);
    }

    /**
     * Opens the given URL for reading and remembers the URL and character set.
     *
     * @param url location of the input
     * @param charsetName name of the character encoding of the input
     * @throws MaltChainedException if the URL cannot be opened or the
     *         encoding is not supported
     */
    public void open(URL url, String charsetName) throws MaltChainedException {
        setUrl(url);
        setCharsetName(charsetName);
        InputStream stream;
        try {
            stream = url.openStream();
        } catch (IOException e) {
            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
        }
        openOwnedStream(stream, charsetName);
    }

    /**
     * Starts reading from the given stream. A stream that is
     * <code>System.in</code> is left open by {@link #close()}; any other
     * stream is closed by it.
     *
     * @param is the stream to read from
     * @param charsetName name of the character encoding of the stream
     * @throws MaltChainedException if the encoding is not supported
     */
    public void open(InputStream is, String charsetName) throws MaltChainedException {
        try {
            // Recomputed on every open so that a previous System.in session
            // does not prevent later streams from being closed.
            closeStream = (is != System.in);
            open(new InputStreamReader(is, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
        }
    }

    /**
     * Opens a stream that this class created itself and releases it again if
     * opening fails, since no caller holds a reference that could close it.
     */
    private void openOwnedStream(InputStream is, String charsetName) throws MaltChainedException {
        boolean opened = false;
        try {
            open(is, charsetName);
            opened = true;
        } finally {
            if (!opened) {
                closeQuietly(is);
            }
        }
    }

    /**
     * Closes a stream during error cleanup without masking the original failure.
     */
    private static void closeQuietly(InputStream is) {
        try {
            is.close();
        } catch (IOException ignored) {
            // The failure that triggered the cleanup is the one being reported.
        }
    }

    /**
     * Wraps the reader in a {@link BufferedReader} and resets the sentence count.
     */
    private void open(InputStreamReader isr) throws MaltChainedException {
        setReader(new BufferedReader(isr));
        setSentenceCount(0);
    }

    /**
     * Does nothing; this reader has no prolog handling.
     */
    public void readProlog() throws MaltChainedException {

    }

    /**
     * Reads the next balanced bracket group from the input and, when the
     * graph is a {@link PhraseStructure}, builds it from that group.
     *
     * <p>At end of input the reader starts over from the beginning while the
     * requested number of iterations has not been reached; in that case
     * <code>true</code> is returned with a cleared graph.</p>
     *
     * @param syntaxGraph the graph to fill; it is cleared first
     * @return <code>true</code> if a sentence was read or the input was
     *         reopened for another iteration; <code>false</code> if the graph
     *         or the data format instance is missing or the input is exhausted
     * @throws MaltChainedException if the input ends inside an open bracket,
     *         contains an invalid escape sequence, or cannot be read
     */
    public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph == null || dataFormatInstance == null) {
            return false;
        }
        syntaxGraph.clear();
        int brackets = 0;
        try {
            int l = reader.read();
            char c;
            input.setLength(0);

            while (true) {
                if (l == -1) {
                    // End of input must be detected on the int result of
                    // read(); a char can never compare equal to -1.
                    return endOfInput(brackets);
                }

                c = (char)l;
                l = reader.read();

                if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB) {
                    // Line breaks and tabs are never part of the sentence text.
                } else if (c == STARTING_BRACKET) {
                    input.append(c);
                    brackets++;
                } else if (c == CLOSING_BRACKET) {
                    input.append(c);
                    brackets--;
                } else if (c == INPUT_SEPARATOR) {
                    // A blank is kept only when the next character is ordinary
                    // text, i.e. not a bracket, whitespace or end of input.
                    if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
                        input.append(c);
                    }
                } else if (c == '\\') {
                    // Escape sequence: the backslash is kept in the buffer so
                    // that the escaped character is not taken for a delimiter.
                    if (l == -1 || !isEscapable((char)l)) {
                        close();
                        throw new MaltChainedException("Bracket Reader error: invalid escape sequence in the input. ");
                    }
                    c = (char)l;
                    l = reader.read();
                    input.append('\\').append(c);
                } else if (brackets != 0) {
                    input.append(c);
                }

                if (brackets == 0 && input.length() != 0) {
                    // NOTE(review): the look-ahead character held in l is
                    // discarded here; a sentence that follows without any
                    // separator therefore loses its first character.
                    sentenceCount++;
                    terminalCounter = 1;
                    nonTerminalCounter = 1;
                    if (syntaxGraph instanceof PhraseStructure) {
                        bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
                        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
                        }
                    }
                    return true;
                }
            }
        } catch (IOException e) {
            close();
            throw new MaltChainedException("Error when reading from the input file. ", e);
        }
    }

    /**
     * Handles reaching the end of the input while looking for a sentence.
     *
     * @param brackets number of brackets still open at the end of the input
     * @return <code>true</code> if the input was reopened for another
     *         iteration, <code>false</code> if reading is finished
     * @throws MaltChainedException if a bracket is still open or reopening fails
     */
    private boolean endOfInput(int brackets) throws MaltChainedException {
        input.setLength(0);
        if (brackets != 0) {
            close();
            throw new MaltChainedException("Error when reading from the input file. ");
        }
        if (cIterations < nIterations) {
            cIterations++;
            reopen();
            return true;
        }
        return false;
    }

    /**
     * Tells whether the character may follow a backslash in the input.
     */
    private static boolean isEscapable(char c) {
        return c == ' ' || c == '(' || c == ')' || c == '\\' || c == 'n' || c == 'r' || c == 't' || c == '\"' || c == '\'';
    }

    /**
     * Finds every top-level bracket group in <code>input[start, end)</code>
     * and hands its interior to {@link #extract}. Brackets preceded by a
     * backslash are treated as ordinary characters.
     *
     * @param phraseStructure the graph being built
     * @param start first index to scan (inclusive)
     * @param end last index to scan (exclusive)
     * @param parent node that the found groups are attached to, or
     *        <code>null</code> at the top level
     */
    private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
        int bracketsdepth = 0;
        int startpos = start-1;
        for (int i = start, n = end; i < n; i++) {
            if (input.charAt(i) == STARTING_BRACKET && (i == 0 || input.charAt(i - 1) != '\\')) {
                if (bracketsdepth == 0) {
                    startpos = i;
                }
                bracketsdepth++;
            } else if (input.charAt(i) == CLOSING_BRACKET && (i == 0 || input.charAt(i - 1) != '\\')) {
                bracketsdepth--;
                if (bracketsdepth == 0) {
                    extract(phraseStructure, startpos+1, i, parent);
                }
            }
        }
    }

    /**
     * Turns the interior <code>input[begin, end)</code> of one bracket group
     * into a node: a terminal if it contains no unescaped opening bracket,
     * otherwise a nonterminal whose children are read recursively.
     *
     * @param phraseStructure the graph being built
     * @param begin first index of the group's interior (inclusive)
     * @param end last index of the group's interior (exclusive)
     * @param parent node to attach the new node to, or <code>null</code>
     */
    private void extract(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException {
        int index = -1;
        for (int i = begin; i < end; i++) {
            if (input.charAt(i) == STARTING_BRACKET && (i == begin || input.charAt(i - 1) != '\\')) {
                index = i;
                break;
            }
        }
        if (index == -1) {
            extractTerminal(phraseStructure, begin, end, parent);
        } else {
            extractNonTerminal(phraseStructure, begin, index, end, parent);
        }
    }

    /**
     * Creates a token node from <code>input[begin, end)</code>, attaches it
     * to <code>parent</code> and distributes the fields of the text over the
     * input columns and the edge label columns.
     *
     * @throws MaltChainedException if the token node cannot be created or
     *         <code>parent</code> is <code>null</code>
     */
    private void extractTerminal(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException {
        TokenNode t = phraseStructure.addTokenNode(terminalCounter);
        if (t == null) {
            close();
            throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
        }

        terminalCounter++;
        Edge e = null;

        if (parent != null) {
            e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
        } else {
            close();
            throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
        }

        int start = begin;

        Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
        Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
        boolean noneNode = false;
        boolean edgeLabels = false;
        for (int i = begin; i < end; i++) {
            // A field ends at an edge label separator, at an unescaped input
            // separator, or at the last character of the group.
            if (input.charAt(i) == EDGELABEL_SEPARATOR
                    || (input.charAt(i) == INPUT_SEPARATOR && (i == begin || input.charAt(i - 1) != '\\'))
                    || i == end - 1) {
                if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
                    // The text starts with the edge label separator.
                    noneNode = true;
                } else if (start == begin) {
                    // First field: goes to the first input column, unescaped.
                    if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
                        if (inputColumnsIterator.hasNext()) {
                            t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(),
                                    decodeString(fieldText(start, i, end)));
                        }
                        start = i + 1;
                        if (input.charAt(i) == EDGELABEL_SEPARATOR) {
                            edgeLabels = true;
                        }
                    }
                } else if (edgeLabels && e != null) {
                    if (edgeLabelsColumnsIterator.hasNext()) {
                        e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), fieldText(start, i, end));
                    }
                    start = i + 1;
                    if (input.charAt(i) == INPUT_SEPARATOR && (i == begin || input.charAt(i - 1) != '\\')) {
                        // An unescaped blank ends the run of edge labels.
                        edgeLabels = false;
                    }
                } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1
                        && (input.charAt(i+1) != INPUT_SEPARATOR && (i == begin || input.charAt(i - 1) != '\\'))) {
                    // Separator character inside a later field: not a field end.
                } else {
                    if (inputColumnsIterator.hasNext()) {
                        t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), fieldText(start, i, end));
                    }
                    start = i + 1;
                }
            }
        }
    }

    /**
     * Creates (or, without a parent, looks up the root) nonterminal node for
     * a group whose first unescaped opening bracket is at <code>index</code>,
     * labels it from <code>input[begin, index)</code> and reads its children
     * from <code>input[index, end)</code>.
     *
     * @throws MaltChainedException if the nonterminal node cannot be created
     */
    private void extractNonTerminal(PhraseStructure phraseStructure, int begin, int index, int end, PhraseStructureNode parent) throws MaltChainedException {
        PhraseStructureNode nt;
        Edge e = null;
        if (parent == null) {
            nt = phraseStructure.getPhraseStructureRoot();
        } else {
            nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
            if (nt == null) {
                close();
                throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
            }
            nonTerminalCounter++;

            e = phraseStructure.addPhraseStructureEdge(parent, nt);
        }
        Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
        Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
        int newbegin = begin;
        int start = begin;

        for (int i = begin; i < index; i++) {
            if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
                if (start == newbegin) {
                    // First field: the phrase label.
                    if (phraseLabelColumnsIterator.hasNext()) {
                        nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), fieldText(start, i, index));
                    }
                    start = i + 1;
                } else if (e != null) {
                    // Later fields: labels of the edge to the parent.
                    if (edgeLabelsColumnsIterator.hasNext()) {
                        e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), fieldText(start, i, index));
                    }
                    start = i + 1;
                }
            } else if (input.charAt(i) == BLANK) {
                start++;
                newbegin++;
            }
        }

        bracketing(phraseStructure, index, end, nt);
    }

    /**
     * Returns the text of the field that starts at <code>start</code> and is
     * terminated at position <code>i</code>: up to <code>limit</code> when
     * <code>i</code> is the last position before it, otherwise up to (not
     * including) <code>i</code>.
     */
    private String fieldText(int start, int i, int limit) {
        return (i == limit - 1) ? input.substring(start, limit) : input.substring(start, i);
    }

    /**
     * Removes the backslash from escaped brackets and blanks.
     */
    private String decodeString(String string) {
        return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " ");
    }

    /**
     * Does nothing; this reader has no epilog handling.
     */
    public void readEpilog() throws MaltChainedException {

    }

    public BufferedReader getReader() {
        return reader;
    }

    public void setReader(BufferedReader reader) {
        this.reader = reader;
    }

    /**
     * @return the number of sentences read since the input was last opened
     */
    public int getSentenceCount() throws MaltChainedException {
        return sentenceCount;
    }

    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    /**
     * Sets the data format instance and caches its input, phrase structure
     * edge label and phrase structure node label column descriptions.
     *
     * @param inputDataFormatInstance the data format instance to read with
     */
    public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
        this.dataFormatInstance = inputDataFormatInstance;
        inputColumns = dataFormatInstance.getInputColumnDescriptions();
        edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
        phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
    }

    public String getOptions() {
        return optionString;
    }

    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public URL getUrl() {
        return url;
    }

    public void setUrl(URL url) {
        this.url = url;
    }

    public String getCharsetName() {
        return charsetName;
    }

    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    public int getNIterations() {
        return nIterations;
    }

    public void setNIterations(int iterations) {
        nIterations = iterations;
    }

    public int getIterationCounter() {
        return cIterations;
    }

    /**
     * Releases the current reader. The underlying stream is closed unless it
     * is <code>System.in</code>.
     *
     * @throws MaltChainedException if closing the reader fails
     */
    public void close() throws MaltChainedException {
        try {
            if (reader != null) {
                if (closeStream) {
                    reader.close();
                }
                reader = null;
            }
        } catch (IOException e) {
            throw new DataFormatException("Error when closing the input file.", e);
        }
    }
}