001 package org.maltparser.ml.liblinear; 002 003 import java.io.BufferedReader; 004 import java.io.BufferedWriter; 005 import java.io.File; 006 import java.io.FileNotFoundException; 007 import java.io.IOException; 008 import java.io.InputStream; 009 import java.io.InputStreamReader; 010 import java.io.OutputStreamWriter; 011 import java.io.PrintStream; 012 import java.util.ArrayList; 013 import java.util.HashMap; 014 import java.util.LinkedHashMap; 015 import java.util.Map; 016 import java.util.Set; 017 import java.util.jar.JarEntry; 018 import java.util.regex.Pattern; 019 import java.util.regex.PatternSyntaxException; 020 021 import liblinear.FeatureNode; 022 import liblinear.Linear; 023 import liblinear.Model; 024 import liblinear.Parameter; 025 import liblinear.Problem; 026 import liblinear.SolverType; 027 028 029 030 import org.maltparser.core.exception.MaltChainedException; 031 import org.maltparser.core.feature.FeatureVector; 032 import org.maltparser.core.feature.function.FeatureFunction; 033 import org.maltparser.core.feature.value.FeatureValue; 034 import org.maltparser.core.feature.value.MultipleFeatureValue; 035 import org.maltparser.core.feature.value.SingleFeatureValue; 036 import org.maltparser.core.helper.NoPrintStream; 037 import org.maltparser.core.syntaxgraph.DependencyStructure; 038 import org.maltparser.ml.LearningMethod; 039 import org.maltparser.parser.DependencyParserConfig; 040 import org.maltparser.parser.guide.instance.InstanceModel; 041 import org.maltparser.parser.history.action.SingleDecision; 042 import org.maltparser.parser.history.kbest.KBestList; 043 import org.maltparser.parser.history.kbest.ScoredKBestList; 044 045 046 public class Liblinear implements LearningMethod { 047 public final static String LIBLINEAR_VERSION = "1.51"; 048 public enum Verbostity { 049 SILENT, ERROR, ALL 050 } 051 private LinkedHashMap<String, String> liblinearOptions; 052 053 protected InstanceModel owner; 054 protected int learnerMode; 055 protected String name; 056 protected int numberOfInstances; 057 protected boolean saveInstanceFiles; 058 protected boolean excludeNullValues; 059 protected String pathExternalLiblinearTrain = null; 060 private int[] cardinalities; 061 /** 062 * Instance output stream writer 063 */ 064 private BufferedWriter instanceOutput = null; 065 /** 066 * Liblinear model object, only used during classification. 067 */ 068 private Model model = null; 069 070 /** 071 * Parameter string 072 */ 073 private String paramString; 074 075 private ArrayList<FeatureNode> xlist = null; 076 077 private Verbostity verbosity; 078 /** 079 * Constructs a Liblinear learner. 080 * 081 * @param owner the guide model owner 082 * @param learnerMode the mode of the learner TRAIN or CLASSIFY 083 */ 084 public Liblinear(InstanceModel owner, Integer learnerMode) throws MaltChainedException { 085 setOwner(owner); 086 setLearningMethodName("liblinear"); 087 setLearnerMode(learnerMode.intValue()); 088 setNumberOfInstances(0); 089 verbosity = Verbostity.SILENT; 090 091 liblinearOptions = new LinkedHashMap<String, String>(); 092 initLiblinearOptions(); 093 parseParameters(getConfiguration().getOptionValue("liblinear", "liblinear_options").toString()); 094 initSpecialParameters(); 095 if (learnerMode == BATCH) { 096 // if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) { 097 // if (pathExternalLiblinearTrain != null) { 098 // owner.getGuide().getConfiguration().getConfigLogger().info(" Learner : Liblinear external "+ getLibLinearOptions() + "\n"); 099 // } else { 100 // owner.getGuide().getConfiguration().getConfigLogger().info(" Learner : Liblinear "+LIBLINEAR_VERSION+" "+ getLibLinearOptions() + "\n"); 101 // } 102 // } 103 instanceOutput = new BufferedWriter(getInstanceOutputStreamWriter(".ins")); 104 } 105 // else { 106 // if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) { 107 // owner.getGuide().getConfiguration().getConfigLogger().info(" Classifier : Liblinear "+LIBLINEAR_VERSION+" "+ getLibLinearOptions()+ "\n"); 108 // } 109 // } 110 } 111 112 113 public void addInstance(SingleDecision decision, FeatureVector featureVector) throws MaltChainedException { 114 if (featureVector == null) { 115 throw new LiblinearException("The feature vector cannot be found"); 116 } else if (decision == null) { 117 throw new LiblinearException("The decision cannot be found"); 118 } 119 try { 120 instanceOutput.write(decision.getDecisionCode()+"\t"); 121 for (int i = 0; i < featureVector.size(); i++) { 122 FeatureValue featureValue = featureVector.get(i).getFeatureValue(); 123 if (excludeNullValues == true && featureValue.isNullValue()) { 124 instanceOutput.write("-1"); 125 } else { 126 if (featureValue instanceof SingleFeatureValue) { 127 instanceOutput.write(((SingleFeatureValue)featureValue).getCode()+""); 128 } else if (featureValue instanceof MultipleFeatureValue) { 129 Set<Integer> values = ((MultipleFeatureValue)featureValue).getCodes(); 130 int j=0; 131 for (Integer value : values) { 132 instanceOutput.write(value.toString()); 133 if (j != values.size()-1) { 134 instanceOutput.write("|"); 135 } 136 j++; 137 } 138 } 139 } 140 if (i != featureVector.size()) { 141 instanceOutput.write('\t'); 142 } 143 } 144 145 instanceOutput.write('\n'); 146 instanceOutput.flush(); 147 increaseNumberOfInstances(); 148 } catch (IOException e) { 149 throw new LiblinearException("The Liblinear learner cannot write to the instance file. ", e); 150 } 151 } 152 153 public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException { } 154 155 /* (non-Javadoc) 156 * @see org.maltparser.ml.LearningMethod#noMoreInstances() 157 */ 158 public void noMoreInstances() throws MaltChainedException { 159 closeInstanceWriter(); 160 } 161 162 163 /* (non-Javadoc) 164 * @see org.maltparser.ml.LearningMethod#train(org.maltparser.parser.guide.feature.FeatureVector) 165 */ 166 public void train(FeatureVector featureVector) throws MaltChainedException { 167 if (featureVector == null) { 168 throw new LiblinearException("The feature vector cannot be found. "); 169 } else if (owner == null) { 170 throw new LiblinearException("The parent guide model cannot be found. "); 171 } 172 cardinalities = getCardinalities(featureVector); 173 if (pathExternalLiblinearTrain == null) { 174 try { 175 final Problem problem = readLibLinearProblem(getInstanceInputStreamReader(".ins"), cardinalities); 176 if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) { 177 owner.getGuide().getConfiguration().getConfigLogger().info("Creating Liblinear model "+getFile(".mod").getName()+"\n"); 178 } 179 final PrintStream out = System.out; 180 final PrintStream err = System.err; 181 System.setOut(NoPrintStream.NO_PRINTSTREAM); 182 System.setErr(NoPrintStream.NO_PRINTSTREAM); 183 Linear.saveModel(new File(getFile(".mod").getAbsolutePath()), Linear.train(problem, getLiblinearParameters())); 184 System.setOut(err); 185 System.setOut(out); 186 if (!saveInstanceFiles) { 187 getFile(".ins").delete(); 188 } 189 } catch (OutOfMemoryError e) { 190 throw new LiblinearException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e); 191 } catch (IllegalArgumentException e) { 192 throw new LiblinearException("The Liblinear learner was not able to redirect Standard Error stream. ", e); 193 } catch (SecurityException e) { 194 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e); 195 } catch (IOException e) { 196 throw new LiblinearException("The Liblinear learner cannot save the model file '"+getFile(".mod").getAbsolutePath()+"'. ", e); 197 } 198 } else { 199 trainExternal(featureVector); 200 } 201 saveCardinalities(getInstanceOutputStreamWriter(".car"), cardinalities); 202 } 203 204 @Override 205 public double crossValidate(FeatureVector featureVector, int nrOfSplits) 206 throws MaltChainedException { 207 if (featureVector == null) { 208 throw new LiblinearException("The feature vector cannot be found. "); 209 } else if (owner == null) { 210 throw new LiblinearException("The parent guide model cannot be found. "); 211 } 212 213 cardinalities = getCardinalities(featureVector); 214 215 double crossValidationAccuracy = 0.0; 216 217 //if (pathExternalLiblinearTrain == null) { 218 try { 219 final Problem problem = readLibLinearProblem(getInstanceInputStreamReader(".ins"), cardinalities); 220 if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) { 221 owner.getGuide().getConfiguration().getConfigLogger().info("Doing cross validation for model "+ owner.getModelName() + "\n"); 222 } 223 final PrintStream out = System.out; 224 final PrintStream err = System.err; 225 System.setOut(NoPrintStream.NO_PRINTSTREAM); 226 System.setErr(NoPrintStream.NO_PRINTSTREAM); 227 228 int[] target = new int[problem.l]; 229 230 Linear.crossValidation(problem, getLiblinearParameters(), nrOfSplits, target); 231 232 double totalCorrect = 0; 233 for (int i = 0; i < problem.l; i++) 234 if (target[i] == problem.y[i]) ++totalCorrect; 235 236 if(totalCorrect>0) 237 crossValidationAccuracy = 100.0 * totalCorrect / problem.l; 238 239 System.setOut(err); 240 System.setOut(out); 241 //Don't delete the instance file here 242 //if (!saveInstanceFiles) { 243 // getFile(".ins").delete(); 244 //} 245 } catch (OutOfMemoryError e) { 246 throw new LiblinearException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e); 247 } catch (IllegalArgumentException e) { 248 throw new LiblinearException("The Liblinear learner was not able to redirect Standard Error stream. ", e); 249 } catch (SecurityException e) { 250 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e); 251 } 252 //} else { 253 // trainExternal(featureVector); 254 //} 255 256 return crossValidationAccuracy; 257 } 258 259 private void trainExternal(FeatureVector featureVector) throws MaltChainedException { 260 try { 261 maltSVMFormat2OriginalSVMFormat(getInstanceInputStreamReader(".ins"), getInstanceOutputStreamWriter(".ins.tmp"), cardinalities); 262 owner.getGuide().getConfiguration().getConfigLogger().info("Creating Liblinear model (external) "+getFile(".mod").getName()); 263 264 final String[] params = getLibLinearParamStringArray(); 265 String[] arrayCommands = new String[params.length+3]; 266 int i = 0; 267 arrayCommands[i++] = pathExternalLiblinearTrain; 268 for (; i <= params.length; i++) { 269 arrayCommands[i] = params[i-1]; 270 } 271 arrayCommands[i++] = getFile(".ins.tmp").getAbsolutePath(); 272 arrayCommands[i++] = getFile(".mod").getAbsolutePath(); 273 274 if (verbosity == Verbostity.ALL) { 275 owner.getGuide().getConfiguration().getConfigLogger().info('\n'); 276 } 277 final Process child = Runtime.getRuntime().exec(arrayCommands); 278 final InputStream in = child.getInputStream(); 279 final InputStream err = child.getErrorStream(); 280 int c; 281 while ((c = in.read()) != -1){ 282 if (verbosity == Verbostity.ALL) { 283 owner.getGuide().getConfiguration().getConfigLogger().info((char)c); 284 } 285 } 286 while ((c = err.read()) != -1){ 287 if (verbosity == Verbostity.ALL || verbosity == Verbostity.ERROR) { 288 owner.getGuide().getConfiguration().getConfigLogger().info((char)c); 289 } 290 } 291 if (child.waitFor() != 0) { 292 owner.getGuide().getConfiguration().getConfigLogger().info(" FAILED ("+child.exitValue()+")"); 293 } 294 in.close(); 295 err.close(); 296 if (!saveInstanceFiles) { 297 getFile(".ins").delete(); 298 getFile(".ins.tmp").delete(); 299 } 300 owner.getGuide().getConfiguration().getConfigLogger().info('\n'); 301 } catch (InterruptedException e) { 302 throw new LiblinearException("Liblinear is interrupted. ", e); 303 } catch (IllegalArgumentException e) { 304 throw new LiblinearException("The Liblinear learner was not able to redirect Standard Error stream. ", e); 305 } catch (SecurityException e) { 306 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e); 307 } catch (IOException e) { 308 throw new LiblinearException("The Liblinear learner cannot save the model file '"+getFile(".mod").getAbsolutePath()+"'. ", e); 309 } catch (OutOfMemoryError e) { 310 throw new LiblinearException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e); 311 } 312 } 313 314 private int[] getCardinalities(FeatureVector featureVector) { 315 int[] cardinalities = new int[featureVector.size()]; 316 int i = 0; 317 for (FeatureFunction feature : featureVector) { 318 cardinalities[i++] = feature.getFeatureValue().getCardinality(); 319 } 320 return cardinalities; 321 } 322 323 private void saveCardinalities(OutputStreamWriter osw, int[] cardinalities) throws MaltChainedException { 324 final BufferedWriter out = new BufferedWriter(osw); 325 try { 326 for (int i = 0, n = cardinalities.length; i < n; i++) { 327 out.write(Integer.toString(cardinalities[i])); 328 if (i < n - 1) { 329 out.write(','); 330 } 331 } 332 out.write('\n'); 333 out.close(); 334 } catch (IOException e) { 335 throw new LiblinearException("", e); 336 } 337 } 338 339 private int[] loadCardinalities(InputStreamReader isr) throws MaltChainedException { 340 int[] cardinalities = null; 341 try { 342 final BufferedReader in = new BufferedReader(isr); 343 String line; 344 if ((line = in.readLine()) != null) { 345 String[] items = line.split(","); 346 cardinalities = new int[items.length]; 347 for (int i = 0; i < items.length; i++) { 348 cardinalities[i] = Integer.parseInt(items[i]); 349 } 350 } 351 in.close(); 352 } catch (IOException e) { 353 throw new LiblinearException("", e); 354 } catch (NumberFormatException e) { 355 throw new LiblinearException("", e); 356 } 357 return cardinalities; 358 } 359 360 /* (non-Javadoc) 361 * @see org.maltparser.ml.LearningMethod#moveAllInstances(org.maltparser.ml.LearningMethod, org.maltparser.core.feature.function.FeatureFunction, java.util.ArrayList) 362 */ 363 public void moveAllInstances(LearningMethod method, FeatureFunction divideFeature, ArrayList<Integer> divideFeatureIndexVector) throws MaltChainedException { 364 if (method == null) { 365 throw new LiblinearException("The learning method cannot be found. "); 366 } else if (divideFeature == null) { 367 throw new LiblinearException("The divide feature cannot be found. "); 368 } 369 370 try { 371 final BufferedReader in = new BufferedReader(getInstanceInputStreamReader(".ins")); 372 final BufferedWriter out = method.getInstanceWriter(); 373 final StringBuilder sb = new StringBuilder(6); 374 int l = in.read(); 375 char c; 376 int j = 0; 377 378 while(true) { 379 if (l == -1) { 380 sb.setLength(0); 381 break; 382 } 383 c = (char)l; 384 l = in.read(); 385 if (c == '\t') { 386 if (divideFeatureIndexVector.contains(j-1)) { 387 out.write(Integer.toString(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())); 388 out.write('\t'); 389 } 390 out.write(sb.toString()); 391 j++; 392 out.write('\t'); 393 sb.setLength(0); 394 } else if (c == '\n') { 395 out.write(sb.toString()); 396 if (divideFeatureIndexVector.contains(j-1)) { 397 out.write('\t'); 398 out.write(Integer.toString(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())); 399 } 400 out.write('\n'); 401 sb.setLength(0); 402 method.increaseNumberOfInstances(); 403 this.decreaseNumberOfInstances(); 404 j = 0; 405 } else { 406 sb.append(c); 407 } 408 } 409 in.close(); 410 getFile(".ins").delete(); 411 out.flush(); 412 } catch (SecurityException e) { 413 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e); 414 } catch (NullPointerException e) { 415 throw new LiblinearException("The instance file cannot be found. ", e); 416 } catch (FileNotFoundException e) { 417 throw new LiblinearException("The instance file cannot be found. ", e); 418 } catch (IOException e) { 419 throw new LiblinearException("The Liblinear learner read from the instance file. ", e); 420 } 421 422 } 423 424 /* (non-Javadoc) 425 * @see org.maltparser.ml.LearningMethod#predict(org.maltparser.parser.guide.feature.FeatureVector, org.maltparser.ml.KBestList) 426 */ 427 public boolean predict(FeatureVector featureVector, SingleDecision decision) throws MaltChainedException { 428 429 if (model == null) { 430 try { 431 model = Linear.loadModel(new BufferedReader(getInstanceInputStreamReaderFromConfigFile(".mod"))); 432 } catch (IOException e) { 433 throw new LiblinearException("The model cannot be loaded. ", e); 434 } 435 } 436 437 if (cardinalities == null) { 438 if (getConfigFileEntry(".car") != null) { 439 cardinalities = loadCardinalities(getInstanceInputStreamReaderFromConfigFile(".car")); 440 } else { 441 cardinalities = getCardinalities(featureVector); 442 } 443 } 444 //System.out.println("METHOD PREDICT CARDINALITIES SIZE" + cardinalities.length + " FEATURE VECTOR SIZE " +featureVector.size()); 445 if (xlist == null) { 446 xlist = new ArrayList<FeatureNode>(featureVector.size()); 447 } 448 if (model == null) { 449 throw new LiblinearException("The Liblinear learner cannot predict the next class, because the learning model cannot be found. "); 450 } else if (featureVector == null) { 451 throw new LiblinearException("The Liblinear learner cannot predict the next class, because the feature vector cannot be found. "); 452 } 453 int j = 0; 454 int offset = 1; 455 int i = 0; 456 for (FeatureFunction feature : featureVector) { 457 final FeatureValue featureValue = feature.getFeatureValue(); 458 if (!(excludeNullValues == true && featureValue.isNullValue())) { 459 if (featureValue instanceof SingleFeatureValue) { 460 if (((SingleFeatureValue)featureValue).getCode() < cardinalities[i]) { 461 xlist.add(j++, new FeatureNode(((SingleFeatureValue)featureValue).getCode() + offset, 1)); 462 } 463 } else if (featureValue instanceof MultipleFeatureValue) { 464 for (Integer value : ((MultipleFeatureValue)featureValue).getCodes()) { 465 if (value < cardinalities[i]) { 466 xlist.add(j++, new FeatureNode(value + offset, 1)); 467 } 468 } 469 } 470 } 471 offset += cardinalities[i]; 472 i++; 473 } 474 475 FeatureNode[] xarray = new FeatureNode[j]; 476 for (int k = 0; k < j; k++) { 477 xarray[k] = xlist.get(k); 478 } 479 480 if (decision.getKBestList().getK() == 1) { 481 decision.getKBestList().add(Linear.predict(model, xarray)); 482 } else { 483 liblinear_predict_with_kbestlist(model, xarray, decision.getKBestList()); 484 } 485 486 xlist.clear(); 487 488 return true; 489 } 490 491 492 public void terminate() throws MaltChainedException { 493 closeInstanceWriter(); 494 model = null; 495 xlist = null; 496 owner = null; 497 } 498 499 public BufferedWriter getInstanceWriter() { 500 return instanceOutput; 501 } 502 503 protected void closeInstanceWriter() throws MaltChainedException { 504 try { 505 if (instanceOutput != null) { 506 instanceOutput.flush(); 507 instanceOutput.close(); 508 instanceOutput = null; 509 } 510 } catch (IOException e) { 511 throw new LiblinearException("The Liblinear learner cannot close the instance file. ", e); 512 } 513 } 514 515 516 /** 517 * Returns the parameter string for used for configure Liblinear 518 * 519 * @return the parameter string for used for configure Liblinear 520 */ 521 public String getParamString() { 522 return paramString; 523 } 524 525 public InstanceModel getOwner() { 526 return owner; 527 } 528 529 protected void setOwner(InstanceModel owner) { 530 this.owner = owner; 531 } 532 533 public int getLearnerMode() { 534 return learnerMode; 535 } 536 537 public void setLearnerMode(int learnerMode) throws MaltChainedException { 538 this.learnerMode = learnerMode; 539 } 540 541 public String getLearningMethodName() { 542 return name; 543 } 544 545 /** 546 * Returns the current configuration 547 * 548 * @return the current configuration 549 * @throws MaltChainedException 550 */ 551 public DependencyParserConfig getConfiguration() throws MaltChainedException { 552 return owner.getGuide().getConfiguration(); 553 } 554 555 public int getNumberOfInstances() throws MaltChainedException { 556 if(numberOfInstances!=0) 557 return numberOfInstances; 558 else{ 559 //Do a line count of the instance file and return that 560 561 BufferedReader reader = new BufferedReader( getInstanceInputStreamReader(".ins")); 562 try { 563 while(reader.readLine()!=null){ 564 numberOfInstances++; 565 owner.increaseFrequency(); 566 } 567 568 reader.close(); 569 } catch (IOException e) { 570 throw new MaltChainedException("No instances found in file",e); 571 } 572 573 574 575 return numberOfInstances; 576 577 } 578 } 579 580 public void increaseNumberOfInstances() { 581 numberOfInstances++; 582 owner.increaseFrequency(); 583 } 584 585 public void decreaseNumberOfInstances() { 586 numberOfInstances--; 587 owner.decreaseFrequency(); 588 } 589 590 protected void setNumberOfInstances(int numberOfInstances) { 591 this.numberOfInstances = 0; 592 } 593 594 protected void setLearningMethodName(String name) { 595 this.name = name; 596 } 597 598 protected OutputStreamWriter getInstanceOutputStreamWriter(String suffix) throws MaltChainedException { 599 return getConfiguration().getConfigurationDir().getAppendOutputStreamWriter(owner.getModelName()+getLearningMethodName()+suffix); 600 } 601 602 protected InputStreamReader getInstanceInputStreamReader(String suffix) throws MaltChainedException { 603 return getConfiguration().getConfigurationDir().getInputStreamReader(owner.getModelName()+getLearningMethodName()+suffix); 604 } 605 606 protected InputStreamReader getInstanceInputStreamReaderFromConfigFile(String suffix) throws MaltChainedException { 607 return getConfiguration().getConfigurationDir().getInputStreamReaderFromConfigFile(owner.getModelName()+getLearningMethodName()+suffix); 608 } 609 610 protected File getFile(String suffix) throws MaltChainedException { 611 return getConfiguration().getConfigurationDir().getFile(owner.getModelName()+getLearningMethodName()+suffix); 612 } 613 614 protected JarEntry getConfigFileEntry(String suffix) throws MaltChainedException { 615 return getConfiguration().getConfigurationDir().getConfigFileEntry(owner.getModelName()+getLearningMethodName()+suffix); 616 } 617 /** 618 * Reads an instance file into a svm_problem object according to the Malt-SVM format, which is column fixed format (tab-separated). 619 * 620 * @param isr the instance stream reader for the instance file 621 * @param cardinalities a array containing the number of distinct values for a particular column. 622 * @throws LiblinearException 623 */ 624 public Problem readLibLinearProblem(InputStreamReader isr, int[] cardinalities) throws MaltChainedException { 625 Problem problem = new Problem(); 626 627 628 629 try { 630 final BufferedReader fp = new BufferedReader(isr); 631 int max_index = 0; 632 if (xlist == null) { 633 xlist = new ArrayList<FeatureNode>(); 634 } 635 problem.bias = getBias(); 636 problem.l = getNumberOfInstances(); 637 problem.x = new FeatureNode[problem.l][]; 638 problem.y = new int[problem.l]; 639 int i = 0; 640 final Pattern tabPattern = Pattern.compile("\t"); 641 final Pattern pipePattern = Pattern.compile("\\|"); 642 while(true) { 643 String line = fp.readLine(); 644 645 if(line == null) break; 646 String[] columns = tabPattern.split(line); 647 648 if (columns.length == 0) { 649 continue; 650 } 651 652 int offset = 1; 653 int j = 0; 654 try { 655 problem.y[i] = 656 Integer.parseInt(columns[j]); 657 int p = 0; 658 for(j = 1; j < columns.length; j++) { 659 final String[] items = pipePattern.split(columns[j]); 660 for (int k = 0; k < items.length; k++) { 661 try { 662 if (Integer.parseInt(items[k]) != -1) { 663 xlist.add(p, new FeatureNode(Integer.parseInt(items[k])+offset, 1)); 664 p++; 665 } 666 } catch (NumberFormatException e) { 667 throw new LiblinearException("The instance file contain a non-integer value '"+items[k]+"'", e); 668 } 669 } 670 offset += cardinalities[j-1]; 671 } 672 problem.x[i] = xlist.subList(0, p).toArray(new FeatureNode[0]); 673 if(columns.length > 1) { 674 max_index = Math.max(max_index, problem.x[i][p-1].index); 675 } 676 i++; 677 xlist.clear(); 678 } catch (ArrayIndexOutOfBoundsException e) { 679 throw new LiblinearException("Cannot read from the instance file. ", e); 680 } 681 } 682 fp.close(); 683 problem.n = max_index; 684 if ( problem.bias >= 0 ) { 685 problem.n++; 686 } 687 xlist = null; 688 } catch (IOException e) { 689 throw new LiblinearException("Cannot read from the instance file. ", e); 690 } 691 return problem; 692 } 693 694 protected void initSpecialParameters() throws MaltChainedException { 695 if (getConfiguration().getOptionValue("singlemalt", "null_value") != null && getConfiguration().getOptionValue("singlemalt", "null_value").toString().equalsIgnoreCase("none")) { 696 excludeNullValues = true; 697 } else { 698 excludeNullValues = false; 699 } 700 saveInstanceFiles = ((Boolean)getConfiguration().getOptionValue("liblinear", "save_instance_files")).booleanValue(); 701 702 if (!getConfiguration().getOptionValue("liblinear", "liblinear_external").toString().equals("")) { 703 try { 704 if (!new File(getConfiguration().getOptionValue("liblinear", "liblinear_external").toString()).exists()) { 705 throw new LiblinearException("The path to the external Liblinear trainer 'svm-train' is wrong."); 706 } 707 if (new File(getConfiguration().getOptionValue("liblinear", "liblinear_external").toString()).isDirectory()) { 708 throw new LiblinearException("The option --liblinear-liblinear_external points to a directory, the path should point at the 'train' file or the 'train.exe' file"); 709 } 710 if (!(getConfiguration().getOptionValue("liblinear", "liblinear_external").toString().endsWith("train") || getConfiguration().getOptionValue("liblinear", "liblinear_external").toString().endsWith("train.exe"))) { 711 throw new LiblinearException("The option --liblinear-liblinear_external does not specify the path to 'train' file or the 'train.exe' file. "); 712 } 713 pathExternalLiblinearTrain = getConfiguration().getOptionValue("liblinear", "liblinear_external").toString(); 714 } catch (SecurityException e) { 715 throw new LiblinearException("Access denied to the file specified by the option --liblinear-liblinear_external. ", e); 716 } 717 } 718 if (getConfiguration().getOptionValue("liblinear", "verbosity") != null) { 719 verbosity = Verbostity.valueOf(getConfiguration().getOptionValue("liblinear", "verbosity").toString().toUpperCase()); 720 } 721 } 722 723 public String getLibLinearOptions() { 724 StringBuilder sb = new StringBuilder(); 725 for (String key : liblinearOptions.keySet()) { 726 sb.append('-'); 727 sb.append(key); 728 sb.append(' '); 729 sb.append(liblinearOptions.get(key)); 730 sb.append(' '); 731 } 732 return sb.toString(); 733 } 734 735 public void parseParameters(String paramstring) throws MaltChainedException { 736 if (paramstring == null) { 737 return; 738 } 739 final String[] argv; 740 String allowedFlags = "sceB"; 741 try { 742 argv = paramstring.split("[_\\p{Blank}]"); 743 } catch (PatternSyntaxException e) { 744 throw new LiblinearException("Could not split the liblinear-parameter string '"+paramstring+"'. ", e); 745 } 746 for (int i=0; i < argv.length-1; i++) { 747 if(argv[i].charAt(0) != '-') { 748 throw new LiblinearException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 749 } 750 if(++i>=argv.length) { 751 throw new LiblinearException("The last argument does not have any value. "); 752 } 753 try { 754 int index = allowedFlags.indexOf(argv[i-1].charAt(1)); 755 if (index != -1) { 756 liblinearOptions.put(Character.toString(argv[i-1].charAt(1)), argv[i]); 757 } else { 758 throw new LiblinearException("Unknown liblinear parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 759 } 760 } catch (ArrayIndexOutOfBoundsException e) { 761 throw new LiblinearException("The liblinear parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e); 762 } catch (NumberFormatException e) { 763 throw new LiblinearException("The liblinear parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e); 764 } catch (NullPointerException e) { 765 throw new LiblinearException("The liblinear parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e); 766 } 767 } 768 } 769 770 public double getBias() throws MaltChainedException { 771 try { 772 return Double.valueOf(liblinearOptions.get("B")).doubleValue(); 773 } catch (NumberFormatException e) { 774 throw new LiblinearException("The liblinear bias value is not numerical value. ", e); 775 } 776 } 777 778 public Parameter getLiblinearParameters() throws MaltChainedException { 779 Parameter param = new Parameter(SolverType.MCSVM_CS, 0.1, 0.1); 780 String type = liblinearOptions.get("s"); 781 782 if (type.equals("0")) { 783 param.setSolverType(SolverType.L2R_LR); 784 } else if (type.equals("1")) { 785 param.setSolverType(SolverType.L2R_L2LOSS_SVC_DUAL); 786 } else if (type.equals("2")) { 787 param.setSolverType(SolverType.L2R_L2LOSS_SVC); 788 } else if (type.equals("3")) { 789 param.setSolverType(SolverType.L2R_L1LOSS_SVC_DUAL); 790 } else if (type.equals("4")) { 791 param.setSolverType(SolverType.MCSVM_CS); 792 } else if (type.equals("5")) { 793 param.setSolverType(SolverType.L1R_L2LOSS_SVC); 794 } else if (type.equals("6")) { 795 param.setSolverType(SolverType.L1R_LR); 796 } else { 797 throw new LiblinearException("The liblinear type (-s) is not an integer value between 0 and 4. "); 798 } 799 try { 800 param.setC(Double.valueOf(liblinearOptions.get("c")).doubleValue()); 801 } catch (NumberFormatException e) { 802 throw new LiblinearException("The liblinear cost (-c) value is not numerical value. ", e); 803 } 804 try { 805 param.setEps(Double.valueOf(liblinearOptions.get("e")).doubleValue()); 806 } catch (NumberFormatException e) { 807 throw new LiblinearException("The liblinear epsilon (-e) value is not numerical value. ", e); 808 } 809 return param; 810 } 811 812 public void initLiblinearOptions() { 813 liblinearOptions.put("s", "4"); // type = SolverType.L2LOSS_SVM_DUAL (default) 814 liblinearOptions.put("c", "0.1"); // cost = 1 (default) 815 liblinearOptions.put("e", "0.1"); // epsilon = 0.1 (default) 816 liblinearOptions.put("B", "1"); // bias = 1 (default) 817 } 818 819 public String[] getLibLinearParamStringArray() { 820 final ArrayList<String> params = new ArrayList<String>(); 821 822 for (String key : liblinearOptions.keySet()) { 823 params.add("-"+key); params.add(liblinearOptions.get(key)); 824 } 825 return params.toArray(new String[params.size()]); 826 } 827 828 829 public void liblinear_predict_with_kbestlist(Model model, FeatureNode[] x, KBestList kBestList) throws MaltChainedException { 830 int i; 831 final int nr_class = model.getNrClass(); 832 final double[] dec_values = new double[nr_class]; 833 834 Linear.predictValues(model, x, dec_values); 835 final int[] labels = model.getLabels(); 836 int[] predictionList = new int[nr_class]; 837 for(i=0;i<nr_class;i++) { 838 predictionList[i] = labels[i]; 839 } 840 841 double tmpDec; 842 int tmpObj; 843 int lagest; 844 for (i=0;i<nr_class-1;i++) { 845 lagest = i; 846 for (int j=i;j<nr_class;j++) { 847 if (dec_values[j] > dec_values[lagest]) { 848 lagest = j; 849 } 850 } 851 tmpDec = dec_values[lagest]; 852 dec_values[lagest] = dec_values[i]; 853 dec_values[i] = tmpDec; 854 tmpObj = predictionList[lagest]; 855 predictionList[lagest] = predictionList[i]; 856 predictionList[i] = tmpObj; 857 } 858 859 int k = nr_class-1; 860 if (kBestList.getK() != -1) { 861 k = kBestList.getK() - 1; 862 } 863 864 for (i=0; i<nr_class && k >= 0; i++, k--) { 865 if (kBestList instanceof ScoredKBestList) { 866 ((ScoredKBestList)kBestList).add(predictionList[i], (float)dec_values[i]); 867 } else { 868 kBestList.add(predictionList[i]); 869 } 870 871 } 872 } 873 874 /** 875 * Converts the instance file (Malt's own SVM format) into the Liblinear (SVMLight) format. The input instance file is removed (replaced) 876 * by the instance file in the Liblinear (SVMLight) format. If a column contains -1, the value will be removed in destination file. 877 * 878 * @param isr the input stream reader for the source instance file 879 * @param osw the output stream writer for the destination instance file 880 * @param cardinalities a vector containing the number of distinct values for a particular column 881 * @throws LiblinearException 882 */ 883 public static void maltSVMFormat2OriginalSVMFormat(InputStreamReader isr, OutputStreamWriter osw, int[] cardinalities) throws MaltChainedException { 884 try { 885 final BufferedReader in = new BufferedReader(isr); 886 final BufferedWriter out = new BufferedWriter(osw); 887 888 int c; 889 int j = 0; 890 int offset = 1; 891 int code = 0; 892 while(true) { 893 c = in.read(); 894 if (c == -1) { 895 break; 896 } 897 898 if (c == '\t' || c == '|') { 899 if (j == 0) { 900 out.write(Integer.toString(code)); 901 j++; 902 } else { 903 if (code != -1) { 904 out.write(' '); 905 out.write(Integer.toString(code+offset)); 906 out.write(":1"); 907 } 908 if (c == '\t') { 909 offset += cardinalities[j-1]; 910 j++; 911 } 912 } 913 code = 0; 914 } else if (c == '\n') { 915 j = 0; 916 offset = 1; 917 out.write('\n'); 918 code = 0; 919 } else if (c == '-') { 920 code = -1; 921 } else if (code != -1) { 922 if (c > 47 && c < 58) { 923 code = code * 10 + (c-48); 924 } else { 925 throw new LiblinearException("The instance file contain a non-integer value, when converting the Malt SVM format into Liblinear format."); 926 } 927 } 928 } 929 in.close(); 930 out.close(); 931 } catch (IOException e) { 932 throw new LiblinearException("Cannot read from the instance file, when converting the Malt SVM format into Liblinear format. ", e); 933 } 934 } 935 936 protected void finalize() throws Throwable { 937 try { 938 closeInstanceWriter(); 939 } finally { 940 super.finalize(); 941 } 942 } 943 944 /* (non-Javadoc) 945 * @see java.lang.Object#toString() 946 */ 947 public String toString() { 948 final StringBuffer sb = new StringBuffer(); 949 sb.append("\nLiblinear INTERFACE\n"); 950 sb.append(" Liblinear version: "+LIBLINEAR_VERSION+"\n"); 951 sb.append(" Liblinear string: "+paramString+"\n"); 952 953 sb.append(getLibLinearOptions()); 954 return sb.toString(); 955 } 956 957 958 @Override 959 public void divideByFeatureSet( 960 Set<Integer> featureIdsToCreateSeparateBranchesForSet, ArrayList<Integer> divideFeatureIndexVector, String otherId) throws MaltChainedException { 961 962 963 //Create a hash map that maps every feature id to a writer 964 HashMap<Integer, BufferedWriter> featureIdToWriterMap = new HashMap<Integer, BufferedWriter>(); 965 966 for(int element:featureIdsToCreateSeparateBranchesForSet){ 967 968 969 BufferedWriter outputWriter = new BufferedWriter(getConfiguration().getConfigurationDir().getOutputStreamWriter(owner.getModelName().replace('.','_') + element + "." + getLearningMethodName()+".ins")); 970 featureIdToWriterMap.put(element, outputWriter); 971 972 } 973 974 BufferedWriter otherOutputWriter = new BufferedWriter(getConfiguration().getConfigurationDir().getOutputStreamWriter(owner.getModelName().replace('.','_') + otherId + "." + getLearningMethodName()+".ins")); 975 976 977 try { 978 final BufferedReader in = new BufferedReader(getInstanceInputStreamReader(".ins")); 979 //every line will be written to a separate file 980 String line = in.readLine(); 981 final Pattern tabPattern = Pattern.compile("\t"); 982 while(line!=null){ 983 984 //Find out which pot the line shall be put in 985 String[] lineArray = tabPattern.split(line); 986 987 int id = new Integer(lineArray[divideFeatureIndexVector.get(0)+1]); 988 989 if(!featureIdToWriterMap.containsKey(id)){ 990 otherOutputWriter.write(line + "\n"); 991 }else 992 featureIdToWriterMap.get(id).write(getLineToWrite(lineArray,divideFeatureIndexVector.get(0)+1)); 993 994 line = in.readLine(); 995 } 996 997 otherOutputWriter.close(); 998 999 in.close(); 1000 1001 for(BufferedWriter writer: featureIdToWriterMap.values()) 1002 writer.close(); 1003 1004 } catch (SecurityException e) { 1005 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e); 1006 } catch (NullPointerException e) { 1007 throw new LiblinearException("The instance file cannot be found. ", e); 1008 } catch (FileNotFoundException e) { 1009 throw new LiblinearException("The instance file cannot be found. ", e); 1010 } catch (IOException e) { 1011 throw new LiblinearException("The Liblinear learner read from the instance file. ", e); 1012 } 1013 1014 1015 1016 } 1017 1018 1019 private String getLineToWrite(String[] lineArray, int excludeIndex) { 1020 StringBuffer buf = new StringBuffer(); 1021 1022 for(int n = 0; n < lineArray.length; n++) 1023 if(n != excludeIndex) 1024 buf.append(lineArray[n] + "\t"); 1025 buf.append("\n"); 1026 return buf.toString(); 1027 } 1028 1029 1030 @Override 1031 public Map<Integer, Integer> createFeatureIdToCountMap( 1032 ArrayList<Integer> divideFeatureIndexVector) throws MaltChainedException{ 1033 1034 HashMap<Integer, Integer> featureIdToCountMap = new HashMap<Integer, Integer>(); 1035 1036 //Go trough the file and count all feature ids in the given column(s) 1037 1038 try { 1039 final BufferedReader in = new BufferedReader(getInstanceInputStreamReader(".ins")); 1040 //every line will be written to a separate file 1041 String line = in.readLine(); 1042 final Pattern tabPattern = Pattern.compile("\t"); 1043 while(line!=null){ 1044 1045 //Find out which pot the line shall be put in 1046 String[] lineArray = tabPattern.split(line); 1047 1048 for(int n = 0; n < divideFeatureIndexVector.size(); n++){ 1049 int id = new Integer(lineArray[divideFeatureIndexVector.get(n)+1]); 1050 1051 1052 if (!featureIdToCountMap.containsKey(id)) { 1053 1054 featureIdToCountMap.put(id, 0); 1055 1056 } 1057 1058 int previousCount = featureIdToCountMap.get(id); 1059 1060 featureIdToCountMap.put(id, previousCount + 1); 1061 1062 } 1063 1064 line = in.readLine(); 1065 } 1066 in.close(); 1067 } catch (SecurityException e) { 1068 throw new LiblinearException("The Libsvm learner cannot remove the instance file. ", e); 1069 } catch (NullPointerException e) { 1070 throw new LiblinearException("The instance file cannot be found. ", e); 1071 } catch (FileNotFoundException e) { 1072 throw new LiblinearException("The instance file cannot be found. ", e); 1073 } catch (IOException e) { 1074 throw new LiblinearException("The Liblinear learner read from the instance file. ", e); 1075 } 1076 1077 return featureIdToCountMap; 1078 } 1079 1080 }