package decompounder;

import de.uni_leipzig.asv.toolbox.baseforms.Zerleger2;
import decompounder.CompoundDisambiguator.DisambiguatedHypernym;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Determines modifier and head of a noun by running the ASV splitter
 * ({@link Zerleger2}) on it and checking the proposed constituents against
 * GermaNet.
 *
 * <p>Splits of two or three constituents are turned into a binary
 * modifier/head analysis; everything else is reported as undetermined.
 * Results are appended to two files via {@code CompoundDeterminer.writeFile}:
 * one for accepted compounds and one for rejected or ambiguous nouns.
 *
 * @author Verena Henrich
 */
public class CompoundDeterminerASV {

    /** Base name of the file that receives accepted compounds. */
    private static final String COMPOUNDS_FILE = "compounds_determined_with_asv";

    /** Base name of the file that receives rejected or ambiguous nouns. */
    private static final String NON_COMPOUNDS_FILE = "non_compounds_determined_with_asv";

    private final Zerleger2 zerleger;
    private final GermaNet germaNet;

    /** Suffix appended to both output file names; set via {@link #setFileEnding(String)}. */
    private String fileEnding = "";

    /**
     * Creates the determiner and initialises the ASV splitter with the tree
     * files under {@code src/main/resources/ASV_trees}.
     *
     * @param germaNet GermaNet instance used for all lexical lookups
     * @throws IOException declared for callers; presumably raised while the
     *                     splitter loads its tree files
     */
    public CompoundDeterminerASV(GermaNet germaNet) throws IOException {
        this.germaNet = germaNet;

        String red = "src/main/resources/ASV_trees/grfExt.tree";    // reduce file for splitting
        String forw = "src/main/resources/ASV_trees/kompVVic.tree"; // forward file
        String back = "src/main/resources/ASV_trees/kompVHic.tree"; // backward file

        zerleger = new Zerleger2(); // splitter
        zerleger.init(forw, back, red);
    }

    /**
     * Splits {@code noun} with the ASV splitter and tries to derive a binary
     * modifier/head analysis from the result.
     *
     * @param id   GermaNet lexical unit id of the noun
     * @param noun orthographic form to analyse
     * @return the accepted analysis, or an entry with empty modifier/head whose
     *         comment states why no analysis was accepted (the comment is empty
     *         when the noun is not a compound or the candidates are ambiguous)
     * @throws IOException if writing to one of the result files fails
     */
    public CompoundDBEntry determineCompoundWithASV(int id, String noun) throws IOException {
        List<String> splitted = zerleger.kZerlegung(noun);

        // The splitter may emit empty constituents; drop them before counting.
        while (splitted.contains("")) {
            splitted.remove("");
        }

        // "-schaft" is a suffix, not a constituent: glue it back onto its predecessor.
        if (splitted.size() > 1
                && splitted.get(splitted.size() - 1).equalsIgnoreCase("schaft")) {
            splitted.remove(splitted.size() - 1);
            splitted.set(splitted.size() - 1, splitted.get(splitted.size() - 1) + "schaft");
        }

        // Treat a trailing "bau" + "werk" as the single constituent "bauwerk".
        if (splitted.size() > 2
                && splitted.get(splitted.size() - 1).equalsIgnoreCase("werk")
                && splitted.get(splitted.size() - 2).equalsIgnoreCase("bau")) {
            splitted.remove(splitted.size() - 1);
            splitted.set(splitted.size() - 1, "bauwerk");
        }

        if (splitted.size() < 2) {
            // noun is not a compound
            logNonCompound(noun + ": is no compound\n");
        } else if (splitted.size() == 2) {
            return determineBinary(id, noun, splitted);
        } else if (splitted.size() == 3) {
            CompoundDBEntry chosen = determineTernary(id, noun, splitted);
            if (chosen != null) {
                return chosen;
            }
        } else {
            logNonCompound(noun + " has more than 3 constituents: " + splitted + "\n");
            return new CompoundDBEntry("", "", "", id,
                    "ASV: has more than 3 constituents: " + splitted);
        }

        return new CompoundDBEntry("", "", "", id, "");
    }

    /**
     * Handles a split into exactly two constituents.
     *
     * @return the analysis if at least the head is in GermaNet, otherwise an
     *         empty entry commented as not correctly split
     */
    private CompoundDBEntry determineBinary(int id, String noun, List<String> splitted)
            throws IOException {
        // compound as proposed by compound splitter
        String modifier = splitted.get(0);
        String head = splitted.get(1);

        // A constituent containing ";" is not usable as-is, so it is rebuilt
        // from the noun and the length of the other constituent.
        if (modifier.contains(";")) {
            modifier = noun.substring(0, noun.length() - head.length());
        }
        if (head.contains(";")) {
            // NOTE(review): the "+ 1" skips one character after the modifier;
            // kept exactly as in the original - confirm that this is intended.
            head = noun.substring(modifier.length() + 1);
        }

        CompoundDBEntry compound = createCompoundDBEntry(modifier, head, id);
        if (compound.isModifierInGermaNet() && compound.isHeadInGermaNet()) {
            compound.setComment("ASV: 2 constituents, both parts in GN");
            logCompound(compound);
            return compound;
        } else if (compound.isHeadInGermaNet()) {
            compound.setComment("ASV: 2 constituents, head in GN");
            logCompound(compound);
            return compound;
        }

        logNonCompound(noun + " is not correctly splitted: " + splitted + "\n");
        return new CompoundDBEntry("", "", "", id,
                "ASV: is not correctly splitted: " + splitted);
    }

    /**
     * Handles a split into exactly three constituents by building the two
     * possible binary bracketings and choosing between them.
     *
     * @return the chosen analysis, an empty "not correctly splitted" entry if
     *         neither head is in GermaNet, or {@code null} if both candidates
     *         are equally plausible and the caller should return its default
     */
    private CompoundDBEntry determineTernary(int id, String noun, List<String> splitted)
            throws IOException {
        String second = splitted.get(1);

        // Candidate 1: modifier is the first part, head starts with the second part.
        // The last character of the second part is ignored when locating it in
        // the noun (the search starts one character before the end of the modifier).
        String modifier = splitted.get(0);
        String secondWithoutLastChar = second.substring(0, second.length() - 1);
        String head = noun.substring(modifier.length());
        int headStart = noun.indexOf(secondWithoutLastChar, modifier.length() - 1);
        if (headStart != -1) {
            head = noun.substring(headStart);
        }
        CompoundDBEntry compound1 = createCompoundDBEntry(modifier, head, id);

        // Candidate 2: head is the third part, modifier is everything up to
        // the end of the second part.
        head = splitted.get(2);
        int secondStart = noun.indexOf(second);
        if (secondStart != -1) {
            modifier = noun.substring(0, secondStart + second.length());
        } else {
            modifier = noun.substring(0, noun.length() - head.length());
        }
        CompoundDBEntry compound2 = createCompoundDBEntry(modifier, head, id);

        boolean firstComplete = compound1.isModifierInGermaNet() && compound1.isHeadInGermaNet();
        boolean secondComplete = compound2.isModifierInGermaNet() && compound2.isHeadInGermaNet();

        if (firstComplete && secondComplete) {
            // Both bracketings are fully covered: prefer the larger hypernym
            // distance, first for the head, then for the modifier.
            if (compound1.getHeadHypernymDistance() > compound2.getHeadHypernymDistance()) {
                compound1.setComment("ASV: 3 constituents, modifier and head are in GermaNet (headDistance1 > headDistance2)");
                return compound1;
            } else if (compound1.getHeadHypernymDistance() < compound2.getHeadHypernymDistance()) {
                compound2.setComment("ASV: 3 constituents, modifier and head are in GermaNet (headDistance1 < headDistance2)");
                return compound2;
            } else if (compound1.getModifierHypernymDistance() > compound2.getModifierHypernymDistance()) {
                compound1.setComment("ASV: 3 constituents, modifier and head are in GermaNet (modifierDistance1 > modifierDistance2)");
                return compound1;
            } else if (compound1.getModifierHypernymDistance() < compound2.getModifierHypernymDistance()) {
                compound2.setComment("ASV: 3 constituents, modifier and head are in GermaNet (modifierDistance1 < modifierDistance2)");
                return compound2;
            }
            CompoundDeterminer.twoBinaryCompoundsAreInGN++;
            logNonCompound(noun + ": two possible compounds are in GN\n");
            logNonCompound(compound1.toSQLString() + "\n");
            logNonCompound(compound2.toSQLString() + "\n");
            return null;
        } else if (firstComplete) {
            // Original author's note (translated): "is this IF redundant?"
            // It is reached when only candidate 1 is fully in GermaNet.
            compound1.setComment("ASV: 3 constituents, both parts in GN");
            logCompound(compound1);
            return compound1;
        } else if (secondComplete) {
            compound2.setComment("ASV: 3 constituents, both parts in GN");
            logCompound(compound2);
            return compound2;
        } else if (compound1.isHeadInGermaNet() && compound2.isHeadInGermaNet()) {
            // NOTE(review): unlike the other branches, these two returns neither
            // set a comment nor write to the compounds file; kept as in the original.
            if (compound1.getHeadHypernymDistance() > compound2.getHeadHypernymDistance()) {
                return compound1;
            } else if (compound1.getHeadHypernymDistance() < compound2.getHeadHypernymDistance()) {
                return compound2;
            }
            logNonCompound(noun + ": two possible compounds whose heads are in GN\n");
            logNonCompound(compound1.toSQLString() + "\n");
            logNonCompound(compound2.toSQLString() + "\n");
            return null;
        } else if (compound1.isHeadInGermaNet()) {
            compound1.setComment("ASV: 3 constituents, head in GN");
            logCompound(compound1);
            return compound1;
        } else if (compound2.isHeadInGermaNet()) {
            compound2.setComment("ASV: 3 constituents, head in GN");
            logCompound(compound2);
            return compound2;
        }

        logNonCompound(noun + " not correctly splitted: " + splitted + "\n");
        return new CompoundDBEntry("", "", "", id, "ASV: not correctly splitted: " + splitted);
    }

    /** Appends the SQL form of an accepted compound to the compounds file. */
    private void logCompound(CompoundDBEntry compound) throws IOException {
        CompoundDeterminer.writeFile(COMPOUNDS_FILE + fileEnding,
                compound.toSQLString() + "\n", true);
    }

    /** Appends {@code text} verbatim to the non-compounds file. */
    private void logNonCompound(String text) throws IOException {
        CompoundDeterminer.writeFile(NON_COMPOUNDS_FILE + fileEnding, text, true);
    }

    /**
     * Returns {@code word} with its first character upper- or lower-cased.
     * Uses {@link Locale#ROOT} so the result does not depend on the JVM's
     * default locale, and returns an empty string unchanged instead of throwing.
     */
    private static String withFirstChar(String word, boolean upperCase) {
        if (word.isEmpty()) {
            return word;
        }
        String first = word.substring(0, 1);
        first = upperCase ? first.toUpperCase(Locale.ROOT) : first.toLowerCase(Locale.ROOT);
        return first + word.substring(1);
    }

    /**
     * Looks up {@code modifier} and {@code head} in GermaNet and packs the
     * outcome into a {@link CompoundDBEntry} with the comment {@code "ASV"}.
     *
     * <p>Modifier variants are tried in this order: as given, with an appended
     * "e", with a lower-cased first letter. The head is tried with an
     * upper-cased first letter first, then as given. For each variant the
     * hypernym disambiguation is consulted before the plain lexical unit
     * lookup. Ids stay {@code -1} unless the disambiguation succeeds or the
     * plain lookup yields exactly one lexical unit; distances stay {@code -1}
     * unless the disambiguation succeeds.
     *
     * @param modifier   proposed modifier
     * @param head       proposed head
     * @param compoundId GermaNet lexical unit id of the whole compound
     * @return entry describing the analysis and which parts were found
     */
    private CompoundDBEntry createCompoundDBEntry(String modifier, String head, int compoundId) {
        int modifierId = -1;
        int headId = -1;
        String comment = "ASV";
        boolean modifierIsInGermaNet = true;
        boolean headIsInGermaNet = true;
        int modifierHypernymDistance = -1;
        int headHypernymDistance = -1;

        LexUnit compound = germaNet.getLexUnitByID(compoundId);

        // ---- modifier ----
        DisambiguatedHypernym modifierHypernym =
                CompoundDisambiguator.disambiguateHypernym(compound, modifier, germaNet);
        List<LexUnit> lexUnits = new ArrayList<LexUnit>();
        if (modifierHypernym == null) {
            lexUnits = germaNet.getLexUnits(modifier);
            if (lexUnits.isEmpty()) {
                // try adding modifier-e
                String withE = modifier + "e";
                modifierHypernym =
                        CompoundDisambiguator.disambiguateHypernym(compound, withE, germaNet);
                if (modifierHypernym == null) {
                    lexUnits = germaNet.getLexUnits(withE);
                }
                if (modifierHypernym != null || lexUnits.size() > 0) {
                    // NOTE(review): the original also assigned
                    // CompoundDBEntry.COMPOSITIONAL_TYPE_MODIFIER_E to a local
                    // here that was never read or passed on; that dead local
                    // was removed.
                    modifier = withE;
                } else {
                    // try first letter of modifier in lower case
                    String lowered = withFirstChar(modifier, false);
                    modifierHypernym =
                            CompoundDisambiguator.disambiguateHypernym(compound, lowered, germaNet);
                    if (modifierHypernym == null) {
                        lexUnits = germaNet.getLexUnits(lowered);
                    }
                    if (modifierHypernym != null || lexUnits.size() > 0) {
                        modifier = lowered;
                    }
                }
            }
        }

        if (modifierHypernym == null && lexUnits.isEmpty()) {
            modifierIsInGermaNet = false;
        } else if (modifierHypernym != null) {
            modifierId = modifierHypernym.getHypernym().getId();
            modifierHypernymDistance = modifierHypernym.getDistance();
        } else if (lexUnits.size() == 1) {
            modifierId = lexUnits.get(0).getId();
        }

        // ---- head ----
        // try first character of head in upper case
        String capitalizedHead = withFirstChar(head, true);
        DisambiguatedHypernym headHypernym =
                CompoundDisambiguator.disambiguateHypernym(compound, capitalizedHead, germaNet);
        lexUnits = new ArrayList<LexUnit>();
        if (headHypernym == null) {
            lexUnits = germaNet.getLexUnits(capitalizedHead);
            if (lexUnits.isEmpty()) {
                // try as it was
                headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, head, germaNet);
                if (headHypernym == null) {
                    lexUnits = germaNet.getLexUnits(head);
                }
            } else {
                head = capitalizedHead;
            }
        } else {
            head = capitalizedHead;
        }

        if (headHypernym == null && lexUnits.isEmpty()) {
            headIsInGermaNet = false;
        }
        if (headHypernym != null) {
            headId = headHypernym.getHypernym().getId();
            headHypernymDistance = headHypernym.getDistance();
        } else if (lexUnits.size() == 1) {
            headId = lexUnits.get(0).getId();
        }

        return new CompoundDBEntry(compound.getOrthForm(), modifier, modifierId, head, headId,
                compoundId, comment, modifierIsInGermaNet, headIsInGermaNet,
                modifierHypernymDistance, headHypernymDistance);
    }

    /**
     * Sets the suffix for both result file names and writes an empty string to
     * each of the resulting files in non-append mode (presumably creating or
     * truncating them - confirm against {@code CompoundDeterminer.writeFile}).
     *
     * @param fileEnding suffix appended to both result file names
     * @throws IOException if one of the files cannot be written
     */
    public void setFileEnding(String fileEnding) throws IOException {
        this.fileEnding = fileEnding;
        CompoundDeterminer.writeFile(COMPOUNDS_FILE + fileEnding, "", false);
        CompoundDeterminer.writeFile(NON_COMPOUNDS_FILE + fileEnding, "", false);
    }
}