package decompounder; import decompounder.CompoundDisambiguator.DisambiguatedHypernym; import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet; import java.io.FileInputStream; import java.io.IOException; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.logging.Level; import java.util.logging.Logger; /** * * @author Verena Henrich */ public class CompoundDeterminerSMOR { private GermaNet germaNet; private final Connection connection; // private String pathToSMORTool = "/Users/vhenrich/unison/arbuckle/Tools_Resources/SMOR/"; private String pathToSMORTool = "./src/main/resources/SMOR/"; private String smorResultsLatin1 = "tmp/smor_results_latin1.out"; private String smorResultsUTF8 = "tmp/smor_results_utf8.out"; private String allNounsInGermaNet = "tmp/all_nouns_in_germanet.txt"; private String compoundsDeterminedWithSMORFile = "compounds_determined_with_smor"; private String nonCompoundsDeterminedWithSMORFile = "non_compounds_determined_with_smor"; private String fileEnding = ""; private Map> nouns_smorAnalyses; private static final String GET_ALL_NOUN_ORTH_FORMS_WITHOUT_HYPHEN_QUERY = "SELECT distinct l.orth_form AS orth_form " + "FROM lex_unit_table l, synset_table s WHERE l.synset_id = s.id " + "and s.word_category_id = 1 and length(l.orth_form) > 3 " + "and l.orth_form not like '% %' and l.orth_form not like '%-%' " + "order by l.orth_form"; public CompoundDeterminerSMOR(GermaNet germaNet, Connection connection) throws SQLException, IOException { this.germaNet = germaNet; this.connection = connection; prepareSMORAnalysis(); // only needed to extract new noun list from GermaNet determineSMORResults(); } private void prepareSMORAnalysis() throws SQLException, IOException { CompoundDeterminer.writeFile(allNounsInGermaNet, "", false); CompoundDeterminer.writeFile(smorResultsLatin1, "", false); PreparedStatement findNounsStatement = connection.prepareStatement(GET_ALL_NOUN_ORTH_FORMS_WITHOUT_HYPHEN_QUERY); ResultSet results = findNounsStatement.executeQuery(); while (results.next()) { CompoundDeterminer.writeFile(allNounsInGermaNet, results.getString("orth_form") + "\n", true, CompoundDeterminer.latin1Encoding); } results.close(); String smorCommand = pathToSMORTool + "src/fst-infl2 -d -t " + pathToSMORTool + "lib/smor.ca -t " + pathToSMORTool + "lib/smor-cap.ca -t " + pathToSMORTool + "lib/smor-uc.ca -t " + pathToSMORTool + "lib/smor-ss.ca -t " + pathToSMORTool + "lib/smor-ascii.ca " + pathToSMORTool + "lib/smor-guesser.ca " + CompoundDeterminer.resultsFolder + allNounsInGermaNet + " " + CompoundDeterminer.resultsFolder + smorResultsLatin1; try { Process smorProcess = Runtime.getRuntime().exec(smorCommand); smorProcess.waitFor(); CompoundDeterminer.writeFile(smorResultsUTF8, "", false); Scanner scanner = new Scanner(new FileInputStream(CompoundDeterminer.resultsFolder + smorResultsLatin1), CompoundDeterminer.latin1Encoding); try { while (scanner.hasNextLine()){ String line = scanner.nextLine(); CompoundDeterminer.writeFile(smorResultsUTF8, line + "\n", true); } } finally { scanner.close(); } } catch (InterruptedException ex) { Logger.getLogger(CompoundDeterminer.class.getName()).log(Level.SEVERE, null, ex); } } private void determineSMORResults() throws SQLException, IOException { Scanner scanner = new Scanner(new FileInputStream(CompoundDeterminer.resultsFolder + smorResultsUTF8), CompoundDeterminer.utf8Encoding); nouns_smorAnalyses = new HashMap>(); try { String currentOrthForm = ""; List smorAnalyses = new ArrayList(); while (scanner.hasNextLine()){ String line = scanner.nextLine(); if (line.startsWith("<")) { line = line.substring(line.indexOf(">") + 1); } if (line.startsWith("> ")) { if (smorAnalyses.size() > 0) { // System.out.println("nouns_smorAnalyses.put:" + currentOrthForm + " " + smorAnalyses.get(0)); nouns_smorAnalyses.put(currentOrthForm, smorAnalyses); smorAnalyses = new ArrayList(); } currentOrthForm = line.substring(2); // System.out.println("\ncurrentOrthForm=" + currentOrthForm); } else if (!line.startsWith("no result for ")) { if (line.matches(".*.*")) { // System.out.println("pref"); int prefIndex = line.indexOf(""); if (prefIndex >= 0) { // System.out.println("line.substring(prefIndex + 6)=" + line.substring(prefIndex + 6, prefIndex + 7)); if (line.substring(prefIndex + 6, prefIndex + 7).matches("[A-ZÄÖÜ]")) { String tmpLine = line; // System.out.println("line1=" + line); line = ""; int wordStart = tmpLine.substring(0, prefIndex).lastIndexOf(">") + 1; if (wordStart < 0) { wordStart = 0; } else { line += tmpLine.substring(0, wordStart); } int wordEnd = tmpLine.substring(prefIndex + 6).indexOf("<") - 1; if (wordEnd < 0) { wordEnd = tmpLine.length() - 1; } // System.out.println("start=" + wordStart + ", end=" + wordEnd); // System.out.println(line.substring(wordStart, wordStart + 1).toUpperCase()); // System.out.println(line.substring(wordStart + 1, prefIndex)); // System.out.println(line.substring(prefIndex + 6, prefIndex + 6 + wordEnd).toLowerCase()); // System.out.println(line.substring( prefIndex + 6 + wordEnd)); line += tmpLine.substring(wordStart, wordStart + 1).toUpperCase() + tmpLine.substring(wordStart + 1, prefIndex) + tmpLine.substring(prefIndex + 6, prefIndex + 6 + wordEnd).toLowerCase() + tmpLine.substring(prefIndex + 6 + wordEnd); // System.out.println("line2=" + line); } else { // System.out.println("line3=" + line); line = line.replaceAll("", ""); // System.out.println("line4=" + line); } } } line = line.replaceAll("", "").replaceAll("", "");//.replaceAll("", "PREF"); line = line.replaceAll("", "").replaceAll("", "").replaceAll("", ""); line = line.replaceAll("", "").replaceAll("", "").replaceAll("", "").replaceAll("", ""); line = line.replaceAll("", "").replaceAll("", ""); line = line.replaceAll("", "").replaceAll("", "").replaceAll("", ""); line = line.replaceAll("", "").replaceAll("", "").replaceAll("", ""); // if (line.split(">(\\w|\\ä|\\ö|\\ü)").length == 2) { // line = line.replaceAll(">er<\\+NN>", ">").replaceAll(">ler<\\+NN>", ">");; // } String[] splittedLine = line.split(">(\\w|\\ä|\\ö|\\ü)"); String smorResult = ""; Integer index = -1; if (splittedLine.length > 1) { for (int i = 0; i < splittedLine.length; i++) { // System.out.println("smor: " + splittedLine[i] + " " + splittedLine[i].matches(".*PREF.*")); // System.out.println(splittedLine[i].substring(splittedLine[i].indexOf("PREF") + 5, 1)); // System.out.println(splittedLine[i].substring(splittedLine[i].indexOf("PREF") + 5, 1).matches("[A-ZÄÖÜ]")); // int prefIndex = splittedLine[i].indexOf("PREF"); // if (prefIndex >= 0 // && splittedLine[i].substring(prefIndex + 4, prefIndex + 5).matches("[A-ZÄÖÜ]")) { // System.out.print("smor PREF " + splittedLine[i]); // splittedLine[i] = splittedLine[i].substring(0, 1).toUpperCase() // + splittedLine[i].substring(1, splittedLine[i].indexOf("<")).replace("PREF", "").toLowerCase() // + splittedLine[i].substring(splittedLine[i].indexOf("<")); // System.out.println(" --> " + splittedLine[i]); // } index = splittedLine[i].length() + 1; if (splittedLine.length > 2 && i > 0 && i < splittedLine.length - 1) { index++; } String nounEnding = ""; int nounEndingWithTagsLength = 0; if (i < splittedLine.length - 1) { // if (line.substring(index).startsWith("barkeit<+NN>")) { // nounEnding = "barkeit"; // nounEndingWithTagsLength = "barkeit<+NN>".length(); // } else if (line.substring(index).startsWith("barkeit")) { // nounEnding = "barkeit"; // nounEndingWithTagsLength = "barkeit".length(); if (line.substring(index).startsWith("bar")) { nounEnding = "bar"; nounEndingWithTagsLength = "bar".length(); } else if (line.substring(index).startsWith("tum<+NN>")) { nounEnding = "tum"; nounEndingWithTagsLength = "tum<+NN>".length(); } else if (line.substring(index).startsWith("ung<+NN>")) { nounEnding = "ung"; nounEndingWithTagsLength = "ung<+NN>".length(); } else if (line.substring(index).startsWith("ung")) { nounEnding = "ung"; nounEndingWithTagsLength = "ung".length(); } else if (line.substring(index).startsWith("ie<+NN>")) { nounEnding = "ie"; nounEndingWithTagsLength = "ie<+NN>".length(); } else if (line.substring(index).startsWith("er<+NN>")) { nounEnding = "er"; nounEndingWithTagsLength = "er<+NN>".length(); } else if (line.substring(index).startsWith("er")) { nounEnding = "er"; nounEndingWithTagsLength = "er".length(); } else if (line.substring(index).startsWith("ler")) { nounEnding = "ler"; nounEndingWithTagsLength = "ler".length(); } else if (line.substring(index).startsWith("ler<+NN>")) { nounEnding = "ler"; nounEndingWithTagsLength = "ler<+NN>".length(); } else if (line.substring(index).startsWith("keit<+NN>")) { nounEnding = "keit"; nounEndingWithTagsLength = "keit<+NN>".length(); } else if (line.substring(index).startsWith("keit")) { nounEnding = "keit"; nounEndingWithTagsLength = "keit".length(); } else if (line.substring(index).startsWith("heit<+NN>")) { nounEnding = "heit"; nounEndingWithTagsLength = "heit<+NN>".length(); } else if (line.substring(index).startsWith("chen<+NN>")) { nounEnding = "chen"; nounEndingWithTagsLength = "chen<+NN>".length(); } else if (line.substring(index).startsWith("igkeit<+NN>")) { nounEnding = "igkeit"; nounEndingWithTagsLength = "igkeit<+NN>".length(); } else if (line.substring(index).startsWith("schaft<+NN>")) { nounEnding = "schaft"; nounEndingWithTagsLength = "schaft<+NN>".length(); } else if (line.substring(index).startsWith("erei<+NN>")) { nounEnding = "erei"; nounEndingWithTagsLength = "erei<+NN>".length(); } else if (line.substring(index).startsWith("ei<+NN>")) { nounEnding = "ei"; nounEndingWithTagsLength = "ei<+NN>".length(); } } if (!nounEnding.equals("")) { // System.out.println("line=" + line + ", indexof=" + line.substring(0, line.indexOf("<") - 2)); int wordLength = line.indexOf("<"); int start = 0; if (i > 0) { int cut = 2; while (wordLength > cut && start < 1) { start = currentOrthForm.toLowerCase().indexOf(line.substring(0, wordLength - cut).toLowerCase()); cut++; } } int end = -1; if (i == splittedLine.length - 2) { end = currentOrthForm.length(); } else if (currentOrthForm.indexOf(nounEnding, start + wordLength - nounEnding.length()) != -1) { end = currentOrthForm.indexOf(nounEnding, start + wordLength - nounEnding.length()) + nounEnding.length(); } // System.out.println("start=" + start + ", end=" + end + ", wordlength=" + wordLength + ", nounEnding=" + nounEnding); if (start == -1) { smorResult += line.substring(0, index) + nounEnding; } else if (end == -1) { smorResult += line.substring(0, index + nounEndingWithTagsLength); } else { smorResult += currentOrthForm.substring(start, start+1).toUpperCase() + currentOrthForm.substring(start+1, end) + ""; } // System.out.println("smorresult=" + smorResult); index += nounEndingWithTagsLength; i++; } else { smorResult += line.substring(0, index); } if (i < splittedLine.length - 1) { smorResult += " "; } line = line.substring(index); } smorResult = smorResult.replaceAll(" in<\\+NN>", "in<+NN>"); smorResult = smorResult.replaceAll(" in<\\+NN>", "in<+NN>"); smorResult = smorResult.replaceAll(" keit<\\+NN>", "keit<+NN>"); smorResult = smorResult.replaceAll(" keit", "keit"); if (!smorAnalyses.contains(smorResult)) { // System.out.println(currentOrthForm + ": " + smorResult); smorAnalyses.add(smorResult); } } } } } finally { scanner.close(); } } /** * Probleme mit SMOR: * - Amtsärztin --> Amt+Arztin * - Türklinke --> Türe+Klinke * - viele Wörter, die es als Nomen und Verben gibt, werden als Verben angegeben (Denominalisierung), * Bsp: -ung, -keit, -heit * - Festmacherleine --> festmachener+Leine * * @param id * @param noun * @param smorAnalyses * @return * @throws IOException * @throws SQLException */ public CompoundDBEntry determineCompoundWithSMOR(int id, String noun) throws IOException, SQLException { List smorAnalyses = nouns_smorAnalyses.get(noun); String modifier = ""; String head = ""; String comment = ""; if (smorAnalyses != null) { if (smorAnalyses.size() == 1) { // only one SMOR result String[] smorResult = smorAnalyses.get(0).split(" "); if (smorResult.length == 2) { // only one binary SMOR result modifier = smorResult[0].split("<")[0]; head = smorResult[1].split("<")[0]; comment = "SMOR: only one binary result: " + smorAnalyses.get(0); CompoundDeterminer.writeFile(compoundsDeterminedWithSMORFile + fileEnding, (new CompoundDBEntry(modifier, head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, comment, germaNet); } else if (smorResult.length > 2) { // only one non-binary SMOR result comment = "SMOR: only one non binary result: " + smorAnalyses.get(0); CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding, noun + " " + comment + "\n", true); return new CompoundDBEntry(noun, modifier, head, id, comment); } } else { // more than one SMOR results List smorAnalysesWithoutRegardingTags = new ArrayList(); List smorAnalysesWithoutRegardingTags_withTags = new ArrayList(); List smorAnalysesBinaryOnly = new ArrayList(); List smorAnalysesBinaryOnly_withTags = new ArrayList(); // filter SMOR results that are equal when tags are not regarded and // extract binary results for (String smorResult : smorAnalyses) { String smorResultWithoutTags = smorResult.replaceAll("<[^>]+>", ""); if (!smorAnalysesWithoutRegardingTags.contains(smorResultWithoutTags)) { smorAnalysesWithoutRegardingTags.add(smorResultWithoutTags); smorAnalysesWithoutRegardingTags_withTags.add(smorResult); if (smorResultWithoutTags.split(" ").length == 2) { smorAnalysesBinaryOnly.add(smorResultWithoutTags); smorAnalysesBinaryOnly_withTags.add(smorResult); } } // CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile, // noun + ": " + smorResult + " -- " + smorResultWithoutTags + "\n", true); } if (smorAnalysesWithoutRegardingTags_withTags.size() == 1 && smorAnalysesBinaryOnly_withTags.size() == 1) { // only one binary result left after filtering String[] smorAnalyse = smorAnalysesWithoutRegardingTags_withTags.get(0).split(" "); modifier = smorAnalyse[0].split("<")[0]; head = smorAnalyse[1].split("<")[0]; comment = "SMOR: only one binary result (without regarding tags): " + smorAnalysesBinaryOnly_withTags.get(0); CompoundDeterminer.writeFile(compoundsDeterminedWithSMORFile + fileEnding, (new CompoundDBEntry(modifier, head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, comment, germaNet); } else if (smorAnalysesWithoutRegardingTags_withTags.size() == smorAnalysesBinaryOnly_withTags.size()) { // more than one (only binary) results left after filtering List smorAnalysesPotentialHeads = new ArrayList(); List smorAnalysesPotentialModifiers = new ArrayList(); // extract potential heads and modifiers for (String smorResult_withoutTags : smorAnalysesBinaryOnly) { if (!smorAnalysesPotentialHeads.contains(smorResult_withoutTags.split(" ")[1])) { smorAnalysesPotentialHeads.add(smorResult_withoutTags.split(" ")[1]); } if (!smorAnalysesPotentialModifiers.contains(smorResult_withoutTags.split(" ")[0])) { smorAnalysesPotentialModifiers.add(smorResult_withoutTags.split(" ")[0]); } } // extract headHypernym DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(germaNet.getLexUnitByID(id), smorAnalysesPotentialHeads, germaNet); String headHypernymString = ""; if (headHypernym != null && headHypernym.getDistance() < 5) { headHypernymString = headHypernym.getHypernym().getOrthForm(); } // extract modifierHypernym DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(germaNet.getLexUnitByID(id), smorAnalysesPotentialHeads, germaNet); String modifierHypernymString = ""; if (modifierHypernym != null && modifierHypernym.getDistance() < 5) { modifierHypernymString = modifierHypernym.getHypernym().getOrthForm(); } // extract matches for headHypernym and modifierHypernym List headHypernymMatch = new ArrayList(); List modifierHypernymMatch = new ArrayList(); for (String smorResult_withTags : smorAnalysesBinaryOnly_withTags) { if (smorResult_withTags.split(" ")[1].split("<")[0].equals(headHypernymString)) { headHypernymMatch.add(smorResult_withTags); } if (smorResult_withTags.split(" ")[0].split("<")[0].equals(modifierHypernymString)) { modifierHypernymMatch.add(smorResult_withTags); } } if (headHypernymMatch.size() == 1) { modifier = headHypernymMatch.get(0).split(" ")[0].split("<")[0]; head = headHypernymMatch.get(0).split(" ")[1].split("<")[0]; comment = "SMOR: head is hypernym: " + headHypernymMatch.get(0); CompoundDeterminer.writeFile(compoundsDeterminedWithSMORFile + fileEnding, (new CompoundDBEntry(modifier, head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, comment, germaNet); } if (modifierHypernymMatch.size() == 1) { modifier = modifierHypernymMatch.get(0).split(" ")[0].split("<")[0]; head = modifierHypernymMatch.get(0).split(" ")[1].split("<")[0]; comment = "SMOR: modifier is hypernym: " + modifierHypernymMatch.get(0); CompoundDeterminer.writeFile(compoundsDeterminedWithSMORFile + fileEnding, (new CompoundDBEntry(modifier, head, id, comment, germaNet)).toSQLString() + "\n", true); return new CompoundDBEntry(modifier, head, id, comment, germaNet); } // CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile, // noun + " (binary after filtering): " + smorAnalysesWithoutRegardingTags_withTags.size() + " " // + smorAnalysesWithoutRegardingTags.size() + " " // + smorAnalysesBinaryOnly.size() + " " + smorAnalysesBinaryOnly_withTags.size() + "\n", true); comment = "SMOR: binary after filtering: "; for (String string : smorAnalysesWithoutRegardingTags_withTags) { CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding, noun + " (binary after filtering): " + string + "\n", true); comment += string + "; "; } return new CompoundDBEntry(noun, modifier, head, id, comment); } else if (smorAnalysesWithoutRegardingTags_withTags.size() == 1) { // only one (non-binary) result left after filtering // CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile, // noun + " (one non binary after filtering): " + smorAnalysesWithoutRegardingTags_withTags.size() + " " // + smorAnalysesWithoutRegardingTags.size() + " " // + smorAnalysesBinaryOnly.size() + " " + smorAnalysesBinaryOnly_withTags.size() + "\n", true); comment = "SMOR: one non binary after filtering): " + smorAnalysesWithoutRegardingTags_withTags.get(0); CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding, noun + " " + comment + "\n\n", true); return new CompoundDBEntry(noun, modifier, head, id, comment); } else { // more than one (non-binary) results left after filtering // CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile, // noun + " (more non binary): " + smorAnalysesWithoutRegardingTags_withTags.size() + " " // + smorAnalysesWithoutRegardingTags.size() + " " // + smorAnalysesBinaryOnly.size() + " " + smorAnalysesBinaryOnly_withTags.size() + "\n", true); comment = "SMOR: more non binary: "; for (String string : smorAnalysesWithoutRegardingTags_withTags) { CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding, noun + " (more non binary): " + string + "\n", true); comment += string + "; "; } return new CompoundDBEntry(noun, modifier, head, id, comment); } } } return new CompoundDBEntry(noun, modifier, head, id, comment); } public void setFileEnding(String fileEnding) throws IOException { this.fileEnding = fileEnding; CompoundDeterminer.writeFile(compoundsDeterminedWithSMORFile + fileEnding, "", false); CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding, "", false); } }