package decompounder; import decompounder.CompoundDisambiguator.DisambiguatedHypernym; import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet; import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit; import java.io.IOException; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; /** * * @author Verena Henrich */ public class CompoundDeterminerGN { private GermaNet germaNet; private final Connection connection; private String compoundsWithHyphensFile = "compounds_with_hyphens###.txt"; private String nonCompoundsWithEmptySpaceFile = "non_compounds_with_empty_space###.txt"; private String compoundsDeterminedWithGNFile = "compounds_determined_with_gn"; private String nonCompoundsDeterminedWithGNFile = "non_compounds_determined_with_gn"; private String fileEnding = ""; public CompoundDeterminerGN(GermaNet germaNet, Connection connection) throws IOException { this.germaNet = germaNet; this.connection = connection; // CompoundDeterminer.writeFile(compoundsWithHyphensFile, query, false); } private static final String GET_NOUNS_WITH_HYPHENS_QUERY = "select comp.id, comp.orth_form " + "from lex_unit_table comp, synset_table s " + "where comp.orth_form similar to '%_-_%' " + "and comp.orth_form not like '% %' " + "and s.id = comp.synset_id " + "and s.word_category_id = 1"; private static final String GET_NOUNS_WITH_HYPHENS_FROM_TO_QUERY = "select comp.id, comp.orth_form " + "from lex_unit_table comp, synset_table s " + "where comp.orth_form similar to '%_-_%' " + "and comp.orth_form not like '% %' " + "and s.id = comp.synset_id " + "and comp.id between ? and ? 
" + "and s.word_category_id = 1"; private static final String GET_NOUNS_WITH_EMPTY_SPACE_QUERY = "select comp.id, comp.orth_form " + "from lex_unit_table comp, synset_table s " + "where comp.orth_form similar to '% %' " // + "and comp.orth_form not like '% %' " + "and s.id = comp.synset_id " + "and s.word_category_id = 1 " + "ORDER BY comp.id"; private static final String GET_NOUNS_WITH_EMPTY_SPACE_FROM_TO_QUERY = "select comp.id, comp.orth_form " + "from lex_unit_table comp, synset_table s " + "where comp.orth_form similar to '% %' " // + "and comp.orth_form not like '% %' " + "and s.id = comp.synset_id " + "and comp.id between ? and ? " + "and s.word_category_id = 1 " + "ORDER BY comp.id"; private static final String GET_POTENTIAL_MODIFIERS_QUERY = "select distinct l_modifier.orth_form " + "from lex_unit_table l_modifier " + "where length(l_modifier.orth_form) > 1 " + "and (? like lower(l_modifier.orth_form) || '%__' " + "or (? like substring(l_modifier.orth_form from 0 for length(l_modifier.orth_form)) || '__%' " + "and substring(l_modifier.orth_form from length(l_modifier.orth_form)) like 'e')) " + "and substring(l_modifier.orth_form from 2) not similar to '[A-Z]%'"; private static final String GET_POTENTIAL_HEADS_QUERY = "select distinct l_head.orth_form " + "from lex_unit_table l_head " + "where length(l_head.orth_form) > 2 " // + "and l_head.orth_form not like 'in' " // + "and l_head.orth_form not like 'Ei' " // + "and l_head.orth_form not like 'Rei' " // + "and l_head.orth_form not like 'Re' " + "and ? 
like '__%' || lower(l_head.orth_form) " + "and substring(l_head.orth_form from 1 for 1) similar to '[A-ZÄÖÜ]' " + "and substring(l_head.orth_form from 2) not similar to '[A-ZÄÖÜ]%'"; public void processNonCompoundsWithEmptySpace() throws SQLException, IOException { CompoundDeterminer.writeFile(nonCompoundsWithEmptySpaceFile.replace("###", ""), "", false); PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITH_EMPTY_SPACE_QUERY); ResultSet results = findNounsStatement.executeQuery(); while (results.next()) { int id = results.getInt("id"); // LexUnit noun = germaNet.getLexUnitByID(id); String orthForm = results.getString("orth_form"); CompoundDeterminer.writeFile(nonCompoundsWithEmptySpaceFile.replace("###", ""), "np\t\t" + orthForm + "\t" + id + "\t\\N\t\t\t\\N\t\t\tGN: orth form with empty space (i.e., no compound)" + "\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\n", true); } results.close(); } public void processNonCompoundsWithEmptySpace(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException { CompoundDeterminer.writeFile(nonCompoundsWithEmptySpaceFile.replace("###", "_from_" + fromLexUnitId + "_to_" + toLexUnitId), "", false); PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITH_EMPTY_SPACE_FROM_TO_QUERY); findNounsStatement.setInt(1, fromLexUnitId); findNounsStatement.setInt(2, toLexUnitId); ResultSet results = findNounsStatement.executeQuery(); while (results.next()) { int id = results.getInt("id"); String orthForm = results.getString("orth_form"); CompoundDeterminer.writeFile(nonCompoundsWithEmptySpaceFile.replace("###", "_from_" + fromLexUnitId + "_to_" + toLexUnitId), "np\t\t" + orthForm + "\t" + id + "\t\\N\t\t\t\\N\t\t\tGN: orth form with empty space (i.e., no compound)" + "\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\n", true); } results.close(); } public void processCompoundsWithHyphens(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException { 
CompoundDeterminer.writeFile(compoundsWithHyphensFile.replace("###", "_from_" + fromLexUnitId + "_to_" + toLexUnitId), "", false); PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITH_HYPHENS_FROM_TO_QUERY); findNounsStatement.setInt(1, fromLexUnitId); findNounsStatement.setInt(2, toLexUnitId); ResultSet results = findNounsStatement.executeQuery(); while (results.next()) { int compoundId = results.getInt("id"); LexUnit compound = germaNet.getLexUnitByID(compoundId); String compoundOrthForm = results.getString("orth_form"); String modifier = compoundOrthForm.substring(0, compoundOrthForm.lastIndexOf("-")); String head = compoundOrthForm.substring(compoundOrthForm.lastIndexOf("-") + 1); int modifierId = -1; int headId = -1; boolean modifierIsInGN = true; boolean headIsInGN = true; // extract modifier List lexUnits; if (modifier.length() > 1) { lexUnits = germaNet.getLexUnits(modifier, false); if (lexUnits.size() < 1) { String changedModifier = modifier.substring(0, 1).toLowerCase() + modifier.substring(1); // System.out.println("changedModifier=" + changedModifier); lexUnits = germaNet.getLexUnits(changedModifier, false); if (lexUnits.size() == 1) { modifier = changedModifier; modifierId = lexUnits.get(0).getId(); // System.out.println("changedmodifier=" + changedModifier); } else { modifierIsInGN = false; // System.out.println("modifier " + modifier + " not in GN"); } } else if (lexUnits.size() == 1) { modifierId = lexUnits.get(0).getId(); } } else { modifierIsInGN = false; } // extract head lexUnits = germaNet.getLexUnits(head, false); if (lexUnits.size() < 1) { String changedHead = head.substring(0, 1).toLowerCase() + head.substring(1); // System.out.println("changedhead=" + changedHead); lexUnits = germaNet.getLexUnits(changedHead, false); if (lexUnits.size() == 1) { head = changedHead; headId = lexUnits.get(0).getId(); // System.out.println("changedhead=" + changedHead); } else { headIsInGN = false; // System.out.println("head " + 
head + " not in GN"); } } else if (lexUnits.size() == 1) { headId = lexUnits.get(0).getId(); } int modifierHypernymDistance = -1; DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, modifier, germaNet); if (modifierHypernym != null) { modifierHypernymDistance = modifierHypernym.getDistance(); } int headHypernymDistance = -1; DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, head, germaNet); if (headHypernym != null) { headHypernymDistance = headHypernym.getDistance(); } CompoundDBEntry compoundWithHyphen = new CompoundDBEntry(compoundOrthForm, modifier, modifierId, head, headId, compoundId, "GN: compound with hypen(s)", modifierIsInGN, headIsInGN, modifierHypernymDistance, headHypernymDistance); CompoundDeterminer.writeFile(compoundsWithHyphensFile.replace("###", "_from_" + fromLexUnitId + "_to_" + toLexUnitId), compoundWithHyphen.toSQLString() + "\n", true); } results.close(); } public void processCompoundsWithHyphens() throws SQLException, IOException { CompoundDeterminer.writeFile(compoundsWithHyphensFile.replace("###", ""), "", false); PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITH_HYPHENS_QUERY); ResultSet results = findNounsStatement.executeQuery(); while (results.next()) { int compoundId = results.getInt("id"); LexUnit compound = germaNet.getLexUnitByID(compoundId); String compoundOrthForm = results.getString("orth_form"); String modifier = compoundOrthForm.substring(0, compoundOrthForm.lastIndexOf("-")); String head = compoundOrthForm.substring(compoundOrthForm.lastIndexOf("-") + 1); int modifierId = -1; int headId = -1; boolean modifierIsInGN = true; boolean headIsInGN = true; // extract modifier List lexUnits; if (modifier.length() > 1) { lexUnits = germaNet.getLexUnits(modifier, false); if (lexUnits.size() < 1) { String changedModifier = modifier.substring(0, 1).toLowerCase() + modifier.substring(1); // 
System.out.println("changedModifier=" + changedModifier); lexUnits = germaNet.getLexUnits(changedModifier, false); if (lexUnits.size() == 1) { modifier = changedModifier; modifierId = lexUnits.get(0).getId(); // System.out.println("changedmodifier=" + changedModifier); } else { modifierIsInGN = false; // System.out.println("modifier " + modifier + " not in GN"); } } else if (lexUnits.size() == 1) { modifierId = lexUnits.get(0).getId(); } } else { modifierIsInGN = false; } // extract head lexUnits = germaNet.getLexUnits(head, false); if (lexUnits.size() < 1) { String changedHead = head.substring(0, 1).toLowerCase() + head.substring(1); // System.out.println("changedhead=" + changedHead); lexUnits = germaNet.getLexUnits(changedHead, false); if (lexUnits.size() == 1) { head = changedHead; headId = lexUnits.get(0).getId(); // System.out.println("changedhead=" + changedHead); } else { headIsInGN = false; // System.out.println("head " + head + " not in GN"); } } else if (lexUnits.size() == 1) { headId = lexUnits.get(0).getId(); } int modifierHypernymDistance = -1; DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, modifier, germaNet); if (modifierHypernym != null) { modifierHypernymDistance = modifierHypernym.getDistance(); } int headHypernymDistance = -1; DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, head, germaNet); if (headHypernym != null) { headHypernymDistance = headHypernym.getDistance(); } CompoundDBEntry compoundWithHyphen = new CompoundDBEntry(compoundOrthForm, modifier, modifierId, head, headId, compoundId, "GN: compound with hypen(s)", modifierIsInGN, headIsInGN, modifierHypernymDistance, headHypernymDistance); CompoundDeterminer.writeFile(compoundsWithHyphensFile.replace("###", ""), compoundWithHyphen.toSQLString() + "\n", true); } results.close(); } public CompoundDBEntry determineCompoundWithGermaNet(int id, String noun) throws IOException, SQLException { if 
(noun.endsWith("keit")) { // System.out.println(noun + ": ends with keit - " + noun.substring(0, noun.lastIndexOf("keit"))); if (noun.endsWith("igkeit") && !germaNet.getLexUnits(noun.substring(0, noun.lastIndexOf("igkeit"))).isEmpty()) { // System.out.println(noun + ": existing word + igkeit"); return new CompoundDBEntry("", "", "", id, "GN: existing word + \'igkeit\'"); } else if (!germaNet.getLexUnits(noun.substring(0, noun.lastIndexOf("keit"))).isEmpty()) { // System.out.println(noun + ": existing word + keit"); return new CompoundDBEntry("", "", "", id, "GN: existing word + \'keit\'"); } else if (!noun.endsWith("fertigkeit") && !noun.endsWith("fähigkeit") && !noun.endsWith("möglichkeit")) { // System.out.println(noun + ": keit"); return new CompoundDBEntry("", "", "", id, "GN: ends with \'keit\'"); } } if (noun.endsWith("heit")) { // System.out.println(noun + ": ends with heit - " + noun.substring(0, noun.lastIndexOf("heit"))); if (!germaNet.getLexUnits(noun.substring(0, noun.lastIndexOf("heit"))).isEmpty()) { // System.out.println(noun + ": existing word + heit"); return new CompoundDBEntry("", "", "", id, "GN: existing word + \'heit\'"); } } List modifiers = new ArrayList(); PreparedStatement findStatement = connection.prepareStatement(GET_POTENTIAL_MODIFIERS_QUERY); findStatement.setString(1, noun.toLowerCase()); findStatement.setString(2, noun.toLowerCase()); ResultSet results = findStatement.executeQuery(); while (results.next()) { modifiers.add(results.getString("orth_form")); } results.close(); // System.out.print(noun + ":"); List heads = new ArrayList(); findStatement = connection.prepareStatement(GET_POTENTIAL_HEADS_QUERY); findStatement.setString(1, noun.toLowerCase()); results = findStatement.executeQuery(); while (results.next()) { heads.add(results.getString("orth_form")); // System.out.print(" " + results.getString("orth_form")); } results.close(); // remove head "Werk" in case the noun ends with "Bauwerk" if (heads.contains("Bauwerk") && 
heads.contains("Werk")) { heads.remove("Werk"); } // System.out.println("\n" + noun); List potentialCompounds_completeMatch = new ArrayList(); List potentialCompounds_completeMatchSmallModifier = new ArrayList(); List potentialCompounds_withModifierE = new ArrayList(); List potentialCompounds_withInterfix = new ArrayList(); // System.out.println("modifiers=" + modifiers); // System.out.println("heads=" + heads); LexUnit compound = germaNet.getLexUnitByID(id); DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, heads, germaNet); LexUnit headSynonym = CompoundDisambiguator.disambiguateSynonym(compound, heads, germaNet); if (headSynonym != null) { // System.out.println(noun + " " + id + ": headSynonym=" + headSynonym.getOrthForm()); } LexUnit headRelated = CompoundDisambiguator.disambiguateRelation(compound, heads, germaNet); if (headRelated != null) { // System.out.println(noun + " " + id + ": headRelated=" + headRelated.getOrthForm()); } LexUnit headPwRelated = CompoundDisambiguator.disambiguatePWRelation(compound, heads, germaNet); if (headPwRelated != null) { // System.out.println(noun + " " + id + ": headPwRelated=" + headPwRelated.getOrthForm()); } else if (headRelated != null) { headPwRelated = headRelated; } List headsToRemove = new ArrayList(); headsToRemove.add("Schaft"); headsToRemove.add("Esse"); headsToRemove.add("Ion"); headsToRemove.add("Sal"); headsToRemove.add("Eid"); headsToRemove.add("Bel"); headsToRemove.add("Ade"); headsToRemove.add("Ale"); headsToRemove.add("Fon"); headsToRemove.add("Ren"); headsToRemove.add("Elle"); headsToRemove.add("Max"); for (String headToRemove : headsToRemove) { if (heads.contains(headToRemove) && !((headHypernym != null && !headHypernym.getHypernym().getOrthForm().equals(headToRemove)) || (headSynonym != null && !headSynonym.getOrthForm().equals(headToRemove)) || (headRelated != null && !headRelated.getOrthForm().equals(headToRemove)) || (headPwRelated != null && 
!headPwRelated.getOrthForm().equals(headToRemove)))) { // System.out.println(noun + " " + id + ": remove head \"" + headToRemove + "\""); heads.remove(headToRemove); } } DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, modifiers, germaNet); LexUnit modifierSynonym = CompoundDisambiguator.disambiguateSynonym(compound, modifiers, germaNet); if (modifierSynonym != null) { // System.out.println(noun + " " + id + ": modifierSynonym=" + modifierSynonym.getOrthForm()); } LexUnit modifierRelated = CompoundDisambiguator.disambiguateRelation(compound, modifiers, germaNet); if (modifierRelated != null) { // System.out.println(noun + " " + id + ": modifierRelated=" + modifierRelated.getOrthForm()); } LexUnit modifierPwRelated = CompoundDisambiguator.disambiguatePWRelation(compound, modifiers, germaNet); if (modifierPwRelated != null) { // System.out.println(noun + " " + id + ": modifierPwRelated=" + modifierPwRelated.getOrthForm()); } else if (modifierRelated != null) { modifierPwRelated = modifierRelated; } List modifiersToRemove = new ArrayList(); // modifiersToRemove.add("Ei"); modifiersToRemove.add("Inn"); modifiersToRemove.add("Rei"); modifiersToRemove.add("Aus"); modifiersToRemove.add("Au"); modifiersToRemove.add("Ge"); modifiersToRemove.add("in"); modifiersToRemove.add("Ga"); modifiersToRemove.add("Re"); for (String modifierToRemove : modifiersToRemove) { if (modifiers.contains(modifierToRemove) && !((modifierHypernym != null && modifierHypernym.getHypernym().getOrthForm().equals(modifierToRemove)) || (modifierSynonym != null && modifierSynonym.getOrthForm().equals(modifierToRemove)) || (modifierRelated != null && modifierRelated.getOrthForm().equals(modifierToRemove)) || (modifierPwRelated != null && modifierPwRelated.getOrthForm().equals(modifierToRemove)))) { // System.out.println(noun + " " + id + ": remove modifier \"" + modifierToRemove + "\""); modifiers.remove(modifierToRemove); } } if (modifiers.contains("Weh") && 
!((modifierHypernym != null && !modifierHypernym.getHypernym().getOrthForm().equals("Weh")) || (modifierSynonym != null && !modifierSynonym.getOrthForm().equals("Weh")) || (modifierRelated != null && !modifierRelated.getOrthForm().equals("Weh")) || (modifierPwRelated != null && !modifierPwRelated.getOrthForm().equals("Weh"))) && (modifiers.contains("Wehr"))) { // System.out.println(noun + " " + id + ": remove modifier \"Weh\""); modifiers.remove("Weh"); } if (modifiers.contains("Tage") && !((modifierHypernym != null && !modifierHypernym.getHypernym().getOrthForm().equals("Tage")) || (modifierSynonym != null && !modifierSynonym.getOrthForm().equals("Tage")) || (modifierRelated != null && !modifierRelated.getOrthForm().equals("Tage")) || (modifierPwRelated != null && !modifierPwRelated.getOrthForm().equals("Tage"))) && (modifiers.contains("Tag"))) { // System.out.println(noun + " " + id + ": remove modifier \"Tage\""); modifiers.remove("Tage"); } if (modifiers.contains("Ei") && !((modifierHypernym != null && !modifierHypernym.getHypernym().getOrthForm().equals("Ei")) || (modifierSynonym != null && !modifierSynonym.getOrthForm().equals("Ei")) || (modifierRelated != null && !modifierRelated.getOrthForm().equals("Ei")) || (modifierPwRelated != null && !modifierPwRelated.getOrthForm().equals("Ei"))) && !(compound.getOrthForm().startsWith("Eier"))) { // System.out.println(noun + " " + id + ": remove modifier \"Ei\""); modifiers.remove("Ei"); } // System.out.println("fuge=" + compound.getOrthForm().length() + " " + modifierHypernym.getHypernym().getOrthForm().length() + " " + headHypernym.getHypernym().getOrthForm().length()); // System.out.println("fuge=" + compound.getOrthForm().substring(modifierHypernym.getHypernym().getOrthForm().length())); // if (modifierHypernym.getHypernym().getOrthForm().length() + headHypernym.getHypernym().getOrthForm().length() <= compound.getOrthForm().length()) { // System.out.println("fuge=" + 
compound.getOrthForm().substring(modifierHypernym.getHypernym().getOrthForm().length(), compound.getOrthForm().length() - headHypernym.getHypernym().getOrthForm().length())); // } if (headPwRelated != null && modifierPwRelated != null && modifierPwRelated.getOrthForm().length() + headPwRelated.getOrthForm().length() <= compound.getOrthForm().length() && compound.getOrthForm().substring(modifierPwRelated.getOrthForm().length(), compound.getOrthForm().length() - headPwRelated.getOrthForm().length()).matches("(e|n|s|er|en|es|ens|)")) { // System.out.println("fuge=" + compound.getOrthForm().substring(headPwRelated.getOrthForm().length(), compound.getOrthForm().length() - modifierPwRelated.getOrthForm().length())); String head = headPwRelated.getOrthForm(); String modifier = modifierPwRelated.getOrthForm(); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: head and modifier are part-whole related", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: head and modifier are part-whole related", germaNet); } else if (headHypernym != null && modifierHypernym != null && modifierHypernym.getHypernym().getOrthForm().length() + headHypernym.getHypernym().getOrthForm().length() <= compound.getOrthForm().length() && compound.getOrthForm().substring(modifierHypernym.getHypernym().getOrthForm().length(), compound.getOrthForm().length() - headHypernym.getHypernym().getOrthForm().length()).matches("(e|n|s|er|en|es|ens|)")) { // System.out.println(noun + ": headHypernym=" + headHypernym.getOrthForm() // + ", modifierHypernym=" + modifierHypernym.getOrthForm()); String head = headHypernym.getHypernym().getOrthForm(); String modifier = modifierHypernym.getHypernym().getOrthForm(); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: head and modifier are hypernyms", germaNet)).toSQLString() + "\n", true); return new 
CompoundDBEntry(modifier, head, id, "GN: head and modifier are hypernyms", germaNet); } else if(!heads.isEmpty() && !modifiers.isEmpty()) { for (String head : heads) { int indexOfHead = noun.lastIndexOf(head.toLowerCase()); if (indexOfHead == -1) { indexOfHead = noun.lastIndexOf(head); if (indexOfHead == -1) { indexOfHead = noun.toLowerCase().lastIndexOf(head.toLowerCase()); } } // System.out.println("indexOfHead=" + indexOfHead + ", noun.substring(0, indexOfHead)=" + noun.substring(0, indexOfHead)); if (modifiers.contains(noun.substring(0, indexOfHead))) { String[] potentialCompound = {noun.substring(0, indexOfHead), head}; potentialCompounds_completeMatch.add(potentialCompound); // System.out.println("potentialCompounds_exactMatches: " + noun.substring(0, indexOfHead) + " " + head); } else if (modifiers.contains(noun.substring(0, indexOfHead) + "e")) { String[] potentialCompound = {noun.substring(0, indexOfHead) + "e", head}; potentialCompounds_withModifierE.add(potentialCompound); // System.out.println("potentialCompounds_withModifierE: " + noun.substring(0, indexOfHead) + "e " + head); } else if (modifiers.contains(noun.substring(0, indexOfHead - 1)) && (noun.substring(indexOfHead - 1, indexOfHead).equals("e") || noun.substring(indexOfHead - 1, indexOfHead).equals("n") || noun.substring(indexOfHead - 1, indexOfHead).equals("s"))) { String[] potentialCompound = {noun.substring(0, indexOfHead - 1), head}; potentialCompounds_withInterfix.add(potentialCompound); // System.out.println("potentialCompounds_withInterfix: " + noun.substring(0, indexOfHead - 1) + " " + head); } else if (modifiers.contains(noun.substring(0, indexOfHead - 2)) && (noun.substring(indexOfHead - 2, indexOfHead).equals("en") || noun.substring(indexOfHead - 2, indexOfHead).equals("er") || noun.substring(indexOfHead - 2, indexOfHead).equals("es"))) { String[] potentialCompound = {noun.substring(0, indexOfHead - 2), head}; potentialCompounds_withInterfix.add(potentialCompound); // 
System.out.println("potentialCompounds_withInterfix: " + noun.substring(0, indexOfHead - 2) + " " + head); } else if (indexOfHead > 2 && modifiers.contains(noun.substring(0, indexOfHead - 3)) && noun.substring(indexOfHead - 3, indexOfHead).equals("ens")) { String[] potentialCompound = {noun.substring(0, indexOfHead - 3), head}; potentialCompounds_withInterfix.add(potentialCompound); // System.out.println("potentialCompounds_withInterfix: " + noun.substring(0, indexOfHead - 3) + " " + head); } else if (modifiers.contains(noun.substring(0, 1).toLowerCase() + noun.substring(1, indexOfHead))) { String[] potentialCompound = {noun.substring(0, 1).toLowerCase() + noun.substring(1, indexOfHead), head}; potentialCompounds_completeMatchSmallModifier.add(potentialCompound); // System.out.println("potentialCompounds_exactMatches: " + noun.substring(0, 1).toLowerCase() + noun.substring(0, indexOfHead) + " " + head); } } if (potentialCompounds_completeMatch.size() == 1) { String modifier = potentialCompounds_completeMatch.get(0)[0]; String head = potentialCompounds_completeMatch.get(0)[1]; CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: 1 complete match", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: 1 complete match", germaNet); } else if (potentialCompounds_completeMatch.size() > 1) { if (headHypernym != null) { String comment = "GN: more complete matches, head is hypernym"; String head = headHypernym.getHypernym().getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatch) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } if (modifierPwRelated != null) { String comment = "GN: more complete matches, 
modifier is part-whole related"; String modifier = modifierPwRelated.getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatch) { if (modifier_head[0].equals(modifier)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet); } } } if (headPwRelated != null) { String comment = "GN: more complete matches, head is part-whole related"; String head = headPwRelated.getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatch) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } if (modifierSynonym != null) { String comment = "GN: more complete matches, modifier has synonym"; String modifier = modifierSynonym.getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatch) { if (modifier_head[0].equals(modifier)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet); } } } if (headSynonym != null) { String comment = "GN: more complete matches, head has synonym"; String head = headSynonym.getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatch) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } String comment = "GN: more complete matches: 
"; CompoundDeterminer.writeFile(nonCompoundsDeterminedWithGNFile + fileEnding, noun + ": modifiers=" + modifiers + ", heads=" + heads + "\n", true); for (String[] modifier_head : potentialCompounds_completeMatch) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "\n" + modifier_head[0] + "-c-" + modifier_head[1] + "\n\n", true); comment += modifier_head[0] + " " + modifier_head[1] + "; "; } return new CompoundDBEntry("", "", "", id, comment); } else if ((potentialCompounds_withInterfix.size() + potentialCompounds_withModifierE.size()) == 1) { if (potentialCompounds_withInterfix.size() == 1) { String modifier = potentialCompounds_withInterfix.get(0)[0]; String head = potentialCompounds_withInterfix.get(0)[1]; CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: interfix", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: interfix", germaNet); } else { String modifier = potentialCompounds_withModifierE.get(0)[0]; String head = potentialCompounds_withModifierE.get(0)[1]; CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: modifier-e", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: modifier-e", germaNet); } } else if ((potentialCompounds_withInterfix.size() + potentialCompounds_withModifierE.size()) > 1) { if (headHypernym != null) { String comment = "GN: more matches with interfix or modifier-e, head is hypernym"; String head = headHypernym.getHypernym().getOrthForm(); for (String[] modifier_head : potentialCompounds_withInterfix) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } for (String[] 
modifier_head : potentialCompounds_withModifierE) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } if (modifierPwRelated != null) { String comment = "GN: more matches with interfix or modifier-e, modifier is part-whole related"; String modifier = modifierPwRelated.getOrthForm(); for (String[] modifier_head : potentialCompounds_withInterfix) { if (modifier_head[0].equals(modifier)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet); } } for (String[] modifier_head : potentialCompounds_withModifierE) { if (modifier_head[0].equals(modifier)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet); } } } if (headPwRelated != null) { String comment = "GN: more matches with interfix or modifier-e, head is part-whole related"; String head = headPwRelated.getOrthForm(); for (String[] modifier_head : potentialCompounds_withInterfix) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } for (String[] modifier_head : potentialCompounds_withModifierE) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, 
comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } if (modifierSynonym != null) { String comment = "GN: more matches with interfix or modifier-e, modifier has synonym"; String modifier = modifierSynonym.getOrthForm(); for (String[] modifier_head : potentialCompounds_withInterfix) { if (modifier_head[0].equals(modifier)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet); } } for (String[] modifier_head : potentialCompounds_withModifierE) { if (modifier_head[0].equals(modifier)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet); } } } if (headSynonym != null) { String comment = "GN: more matches with interfix or modifier-e, head has synonym"; String head = headSynonym.getOrthForm(); for (String[] modifier_head : potentialCompounds_withInterfix) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } for (String[] modifier_head : potentialCompounds_withModifierE) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } String comment = "GN: more matches with interfix or modifier-e: "; for (String[] modifier_head : 
potentialCompounds_withInterfix) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "\n" + modifier_head[0] + "-i-" + modifier_head[1] + "\n\n", true); comment += modifier_head[0] + " " + modifier_head[1] + "; "; } for (String[] modifier_head : potentialCompounds_withModifierE) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "\n" + modifier_head[0] + "-e-" + modifier_head[1] + "\n\n", true); comment += modifier_head[0] + " " + modifier_head[1] + "; "; } return new CompoundDBEntry("", "", "", id, comment); } else if (potentialCompounds_completeMatchSmallModifier.size() > 0) { if (potentialCompounds_completeMatchSmallModifier.size() == 1) { String modifier = potentialCompounds_completeMatchSmallModifier.get(0)[0]; String head = potentialCompounds_completeMatchSmallModifier.get(0)[1]; CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: 1 complete match with modifier small case", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: complete match with modifier small case", germaNet); } if (headHypernym != null) { String comment = "GN: more complete matches with modifier small case, head is hypernym"; String head = headHypernym.getHypernym().getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } if (modifierPwRelated != null) { String comment = "GN: more complete matches with modifier small case, modifier is part-whole related"; String modifier = modifierPwRelated.getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) { if (modifier_head[0].equals(modifier)) 
{ CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet); } } } if (headPwRelated != null) { String comment = "GN: more complete matches with modifier small case, head is part-whole related"; String head = headPwRelated.getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } if (modifierSynonym != null) { String comment = "GN: more complete matches with modifier small case, modifier has synonym"; String modifier = modifierSynonym.getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) { if (modifier_head[0].equals(modifier)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet); } } } if (headSynonym != null) { String comment = "GN: more complete matches with modifier small case, head has synonym"; String head = headSynonym.getOrthForm(); for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) { if (modifier_head[1].equals(head)) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet); } } } String comment = "GN: more complete matches with modifier small case: "; for (String[] modifier_head : 
potentialCompounds_completeMatchSmallModifier) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "\n" + modifier_head[0] + "-s-" + modifier_head[1] + "\n\n", true); comment += modifier_head[0] + " " + modifier_head[1] + "; "; } return new CompoundDBEntry("", "", "", id, comment); // } else if (modifierSynonym != null) { // String modifier = modifierSynonym.getOrthForm(); // String head = noun.substring(modifier.length(), noun.length()); // // for (int cut = 0; cut < 3; cut++) { // // head with first letter upper case // head = head.substring(cut, cut+1).toUpperCase() + head.substring(cut+1); // if (!germaNet.getLexUnits(head).isEmpty()) { // break; // } // // // head all small // head = head.substring(cut); // if (!germaNet.getLexUnits(head).isEmpty()) { // break; // } // // // in case no match yet, use original split with first letter upper case // head = noun.substring(modifier.length(), noun.length()); // head = head.substring(0, 1).toUpperCase() + head.substring(1); // } // System.out.println("head=" + head); // // if (!head.equals("Schaft")) { // CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, // (new CompoundDBEntry(modifier, head, id, // "GN: modifier has synonym", germaNet)).toSQLString() + "\n", true); // return new CompoundDBEntry(modifier, head, id, // "GN: modifier has synonym", germaNet); // } } else if (modifierPwRelated != null) { String modifier = modifierPwRelated.getOrthForm(); String head = noun.substring(modifier.length(), noun.length()); for (int cut = 0; cut < 3; cut++) { // head with first letter upper case head = head.substring(cut, cut+1).toUpperCase() + head.substring(cut+1); if (!germaNet.getLexUnits(head).isEmpty()) { break; } // head all small head = head.substring(cut); if (!germaNet.getLexUnits(head).isEmpty()) { break; } // in case no match yet, use original split with first letter upper case head = noun.substring(modifier.length(), noun.length()); head = head.substring(0, 
1).toUpperCase() + head.substring(1); } // System.out.println("head=" + head); if (!head.equals("Schaft")) { CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: modifier is part-whole related", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: modifier is part-whole related", germaNet); } } else if (headSynonym != null) { String head = headSynonym.getOrthForm(); int indexOfHead = noun.lastIndexOf(head.toLowerCase()); if (indexOfHead == -1) { indexOfHead = noun.lastIndexOf(head); } String modifier = noun.substring(0, indexOfHead); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: head has synonym", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: head has synonym", germaNet); } else if (headPwRelated != null) { String head = headPwRelated.getOrthForm(); int indexOfHead = noun.lastIndexOf(head.toLowerCase()); if (indexOfHead == -1) { indexOfHead = noun.lastIndexOf(head); } String modifier = noun.substring(0, indexOfHead); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: head is part-whole related", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: head is part-whole related", germaNet); } else if (headHypernym != null) { String head = headHypernym.getHypernym().getOrthForm(); int indexOfHead = noun.lastIndexOf(head.toLowerCase()); if (indexOfHead == -1) { indexOfHead = noun.lastIndexOf(head); } String modifier = noun.substring(0, indexOfHead); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: head is hypernym", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: head is hypernym", germaNet); } else { 
CompoundDeterminer.writeFile(nonCompoundsDeterminedWithGNFile + fileEnding, noun + ": modifiers=" + modifiers + ", heads=" + heads + " (no match)\n", true); } } else if (headPwRelated != null) { String head = headPwRelated.getOrthForm(); int indexOfHead = noun.lastIndexOf(head.toLowerCase()); if (indexOfHead == -1) { indexOfHead = noun.lastIndexOf(head); } String modifier = noun.substring(0, indexOfHead); for (int cut = 1; cut < 4; cut++) { if (modifier.length() > cut && modifiers.contains(modifier.substring(0, modifier.length() - cut))) { modifier = modifier.substring(0, modifier.length() - cut); break; } } CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: head is part-whole related", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: head is part-whole related", germaNet); } else if (modifierPwRelated != null) { String modifier = modifierPwRelated.getOrthForm(); String head = noun.substring(modifier.length(), noun.length()); for (int cut = 0; cut < 3; cut++) { // head with first letter upper case head = head.substring(cut, cut+1).toUpperCase() + head.substring(cut+1); if (!germaNet.getLexUnits(head).isEmpty()) { break; } // head all small head = head.substring(cut); if (!germaNet.getLexUnits(head).isEmpty()) { break; } // in case no match yet, use original split with first letter upper case head = noun.substring(modifier.length(), noun.length()); head = head.substring(0, 1).toUpperCase() + head.substring(1); } // System.out.println("head=" + head); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: modifier is part-whole related", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: modifier is part-whole related", germaNet); } else if (headRelated != null) { String head = headRelated.getOrthForm(); int indexOfHead = 
noun.lastIndexOf(head.toLowerCase()); if (indexOfHead == -1) { indexOfHead = noun.lastIndexOf(head); } String modifier = noun.substring(0, indexOfHead); for (int cut = 1; cut < 4; cut++) { if (modifier.length() > cut && modifiers.contains(modifier.substring(0, modifier.length() - cut))) { modifier = modifier.substring(0, modifier.length() - cut); break; } } CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: head is related", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: head is related", germaNet); } else if (modifierRelated != null) { String modifier = modifierRelated.getOrthForm(); String head = noun.substring(modifier.length(), noun.length()); for (int cut = 0; cut < 3; cut++) { // head with first letter upper case head = head.substring(cut, cut+1).toUpperCase() + head.substring(cut+1); if (!germaNet.getLexUnits(head).isEmpty()) { break; } // head all small head = head.substring(cut); if (!germaNet.getLexUnits(head).isEmpty()) { break; } // in case no match yet, use original split with first letter upper case head = noun.substring(modifier.length(), noun.length()); head = head.substring(0, 1).toUpperCase() + head.substring(1); } // System.out.println("head=" + head); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: modifier is related", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: modifiers is related", germaNet); } else if (headHypernym != null) { String head = headHypernym.getHypernym().getOrthForm(); int indexOfHead = noun.lastIndexOf(head.toLowerCase()); if (indexOfHead == -1) { indexOfHead = noun.lastIndexOf(head); } String modifier = noun.substring(0, indexOfHead); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: head is hypernym", germaNet)).toSQLString() + 
"\n", true); return new CompoundDBEntry(modifier, head, id, "GN: head is hypernym", germaNet); } else if (modifierHypernym != null) { String modifier = modifierHypernym.getHypernym().getOrthForm(); String head = noun.substring(modifier.length()); CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, (new CompoundDBEntry(modifier, head, id, "GN: modifier is hypernym", germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, "GN: modifier is hypernym", germaNet); } else { CompoundDeterminer.writeFile(nonCompoundsDeterminedWithGNFile + fileEnding, noun + ": modifiers=" + modifiers + ", heads=" + heads + "\n", true); } return new CompoundDBEntry("", "", "", id, ""); }

/**
 * Sets the suffix that this class appends to the base names of its two
 * output files ({@code compoundsDeterminedWithGNFile} and
 * {@code nonCompoundsDeterminedWithGNFile}) and immediately writes an
 * empty string to both resulting files via
 * {@code CompoundDeterminer.writeFile(..., "", false)}.
 *
 * NOTE(review): the third argument is {@code true} wherever this class
 * adds result lines and {@code false} only here, so it presumably is an
 * "append" flag and this call resets (or creates) both files. Confirm
 * against {@code CompoundDeterminer.writeFile}.
 *
 * @param fileEnding suffix concatenated to both output file base names;
 *        it is stored in the {@code fileEnding} field and used by later writes
 * @throws IOException declared by this method; presumably raised by
 *         {@code CompoundDeterminer.writeFile} when a file cannot be
 *         written (TODO confirm)
 */
public void setFileEnding(String fileEnding) throws IOException { this.fileEnding = fileEnding;
/* Write empty content to both output files under the new suffix (see the review note in the Javadoc). */
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "", false); CompoundDeterminer.writeFile(nonCompoundsDeterminedWithGNFile + fileEnding, "", false); } }