package decompounder; import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; /** * * @author Verena Henrich */ public class CompoundDeterminer { private final Connection connection; private static final String username = "germanet"; private static final String password = "germanet"; private final GermaNet germaNetCaseInsensitive; private final GermaNet germaNet; // public static String databaseUrl = "jdbc:postgresql://localhost:5432/germanet"; public static String databaseUrl = "jdbc:postgresql://localhost:5432/germanet13a"; private String compoundsWithoutHyphensFinalFileName = "compounds_without_hyphens_"; private String compoundsWithoutHyphensToReviewFileName = "compounds_without_hyphens_to_review"; private String fileEnding = ".txt"; private String nonCompoundsFinalFile = "non_compounds"; private String nonCompoundsToReviewFile = "non_compounds_to_review"; public static int twoBinaryCompoundsAreInGN = 0; public static String latin1Encoding = "ISO-8859-1"; // ISO-8859-1 == Latin-1 public static String utf8Encoding = "UTF-8"; private CompoundDeterminerASV compoundDeterminerASV; private CompoundDeterminerGN compoundDeterminerGN; private CompoundDeterminerSMOR compoundDeterminerSMOR; public static String resultsFolder = "src/test/resources/results" + File.separatorChar;// + "2013-05-31_all_new_lexunits" + File.separatorChar; private static final String GET_NOUNS_WITHOUT_HYPHEN_QUERY = "SELECT l.orth_form AS orth_form, l.id AS id " + "FROM lex_unit_table l, synset_table s WHERE l.synset_id = s.id " + "and s.word_category_id = 1 and length(l.orth_form) > 3 " + "and l.orth_form not like '% %' and l.orth_form not like '%-%' " + "and l.id between ? and ?" + " order by l.id"; public static void main(String[] args) throws IOException, Exception { CompoundDeterminer compoundDeterminer = new CompoundDeterminer(); /****************************************** * For creating a new compounds list, * * you need to specify the relevant range * * of lexical units (by their IDs) * ******************************************/ int fromLexUnitId = 118719; //118719 int toLexUnitId = 118819; //125687 141904 compoundDeterminer.processCompoundsWithHyphens(fromLexUnitId, toLexUnitId); compoundDeterminer.processNonCompoundsWithEmptySpace(fromLexUnitId, toLexUnitId); compoundDeterminer.processCompoundsWithoutHyphens(fromLexUnitId, toLexUnitId); } public CompoundDeterminer() throws Exception { germaNetCaseInsensitive = new GermaNet(username, password, databaseUrl, true); germaNet = new GermaNet(username, password, databaseUrl, false); try { this.connection = DriverManager.getConnection(databaseUrl, username, password); } catch (SQLException ex) { throw new Exception(ex); } compoundDeterminerASV = new CompoundDeterminerASV(germaNet); compoundDeterminerGN = new CompoundDeterminerGN(germaNet, connection); compoundDeterminerSMOR = new CompoundDeterminerSMOR(germaNet, connection); } public static void writeFile(String fileName, String content, boolean append) throws IOException { writeFile(fileName, content, append, utf8Encoding); } public static void writeFile(String fileName, String content, boolean append, String encoding) throws IOException { // this would produce temporary "in-between" files that are never needed boolean writeTmpFiles = false; if (!writeTmpFiles && (fileName.contains("compounds_determined_with_") || fileName.contains("_to_review_"))) { return; } try { Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream( new File(resultsFolder + fileName), append), encoding)); writer.write(content); writer.close(); } catch (Exception ex) { System.err.println(ex.getMessage()); } } private Map getNouns(int fromLexUnitId, int toLexUnitId) throws SQLException { Map nouns = new LinkedHashMap(); PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITHOUT_HYPHEN_QUERY); findNounsStatement.setInt(1, fromLexUnitId); findNounsStatement.setInt(2, toLexUnitId); ResultSet results = findNounsStatement.executeQuery(); while (results.next()) { // System.out.println(results.getInt("id") + " " + results.getString("orth_form")); nouns.put(results.getInt("id"), results.getString("orth_form")); } results.close(); return nouns; } private void processCompoundsWithoutHyphens(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException { // while (//fromLexUnitId < germaNet.getNumLexUnits() && // toLexUnitId < 125688) { Map nouns = getNouns(fromLexUnitId, toLexUnitId); fileEnding = "_from_" + fromLexUnitId + "_to_" + toLexUnitId + ".txt"; System.out.println("fileEnding=" + fileEnding); // writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, CompoundDBEntry.COPY_INTO_QUERY, false); writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, "", false); // writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, CompoundDBEntry.COPY_INTO_QUERY, false); writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "", false); writeFile(nonCompoundsFinalFile + fileEnding, "", false); writeFile(nonCompoundsToReviewFile + fileEnding, "", false); compoundDeterminerSMOR.setFileEnding(fileEnding); compoundDeterminerGN.setFileEnding(fileEnding); compoundDeterminerASV.setFileEnding(fileEnding); // go through all extracted nouns Iterator it = nouns.entrySet().iterator(); while (it.hasNext()) { Map.Entry entry = (Map.Entry)it.next(); String noun = entry.getValue(); int id = entry.getKey(); CompoundDBEntry smorCompound = compoundDeterminerSMOR.determineCompoundWithSMOR(id, noun); CompoundDBEntry gnCompound = compoundDeterminerGN.determineCompoundWithGermaNet(id, noun); CompoundDBEntry asvCompound = compoundDeterminerASV.determineCompoundWithASV(id, noun); CompoundDBEntry smorCompoundClone = new CompoundDBEntry(noun, smorCompound.getModifier(), smorCompound.getHead(), id, smorCompound.getComment()); CompoundDBEntry gnCompoundClone = new CompoundDBEntry(noun, gnCompound.getModifier(), gnCompound.getHead(), id, gnCompound.getComment()); CompoundDBEntry asvCompoundClone = new CompoundDBEntry(noun, asvCompound.getModifier(), asvCompound.getHead(), id, asvCompound.getComment()); // System.out.println(smorCompound.toSQLString()); // System.out.println(gnCompound.toSQLString()); // System.out.println(asvCompound.toSQLString()); HashMap changeModifiers = new HashMap(); changeModifiers.put("Neben", "neben"); changeModifiers.put("Gegen", "gegen"); changeModifiers.put("Haupt", "haupt"); changeModifiers.put("General", "general"); changeModifiers.put("Rück", "rück"); changeModifiers.put("Elektro", "elektro"); changeModifiers.put("Ober", "ober"); changeModifiers.put("Spitz", "spitz"); changeModifiers.put("Mini", "mini"); changeModifiers.put("Sonder", "sonder"); changeModifiers.put("Brutto", "brutto"); changeModifiers.put("Netto", "netto"); changeModifiers.put("Bio", "bio"); changeModifiers.put("Über", "über"); changeModifiers.put("Audio", "audio"); changeModifiers.put("Midi", "midi"); changeModifiers.put("Lokal", "lokal"); changeModifiers.put("Mikro", "mikro"); changeModifiers.put("Makro", "makro"); changeModifiers.put("Mittel", "mittel"); changeModifiers.put("Meta", "meta"); changeModifiers.put("Tief", "tief"); changeModifiers.put("Zwischen", "zwischen"); changeModifiers.put("Zweit", "zweit"); changeModifiers.put("einzel", "einzeln"); changeModifiers.put("Einzel", "einzeln"); changeModifiers.put("doppel", "doppelt"); changeModifiers.put("Doppel", "doppelt"); changeModifiers.put("erst", "erste"); changeModifiers.put("Putz", "putzen"); changeModifiers.put("Spann", "spannen"); changeModifiers.put("Warte", "warten"); changeModifiers.put("Fernseh", "Fernsehen"); if (changeModifiers.containsKey(smorCompound.getModifier())) { smorCompound.setModifier(changeModifiers.get(smorCompound.getModifier())); } if (changeModifiers.containsKey(asvCompound.getModifier())) { asvCompound.setModifier(changeModifiers.get(asvCompound.getModifier())); } if (changeModifiers.containsKey(gnCompound.getModifier())) { gnCompound.setModifier(changeModifiers.get(gnCompound.getModifier())); } if (smorCompound.getHead().matches("[a-zäöü].*")) { // System.out.println(noun + " " + id + ": smor-head small case: " + smorCompound.getHead()); smorCompound.setHead(""); } else if (!noun.endsWith(smorCompound.getHead().toLowerCase())) { // System.out.println(noun + " " + id + ": smor-head does not equal noun: " + smorCompound.getHead()); smorCompound.setHead(""); } if (gnCompound.getHead().matches("[a-zäöü].*")) { // System.out.println(noun + " " + id + ": gn-head small case: " + gnCompound.getHead()); gnCompound.setHead(""); } else if (!noun.endsWith(gnCompound.getHead().toLowerCase())) { // System.out.println(noun + " " + id + ": gn-head does not equal noun: " + gnCompound.getHead()); gnCompound.setHead(""); } if (asvCompound.getHead().matches("[a-zäöü].*")) { // System.out.println(noun + " " + id + ": asv-head small case: " + asvCompound.getHead()); asvCompound.setHead(""); } else if (!noun.endsWith(asvCompound.getHead().toLowerCase())) { // System.out.println(noun + " " + id + ": asv-head does not equal noun: " + asvCompound.getHead()); asvCompound.setHead(""); } if (noun.endsWith("machung")) { // System.out.println(noun + ": -machung"); writeFile(nonCompoundsFinalFile + fileEnding, "np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'machung\'; No compound induced\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); } else if (noun.endsWith("igkeit") && !germaNetCaseInsensitive.getLexUnits(noun.substring(0, noun.lastIndexOf("igkeit"))).isEmpty()) { // System.out.println(noun + ": existing word + igkeit"); writeFile(nonCompoundsFinalFile + fileEnding, "np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'igkeit\'; No compound induced\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); // return new CompoundDBEntry("", "", "", id, "GN: existing word + \'igkeit\'"); } else if (noun.endsWith("keit") && !germaNetCaseInsensitive.getLexUnits(noun.substring(0, noun.lastIndexOf("keit"))).isEmpty()) { // System.out.println(noun + ": existing word + keit"); writeFile(nonCompoundsFinalFile + fileEnding, "np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'-keit\'; No compound induced\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); // return new CompoundDBEntry("", "", "", id, "GN: existing word + \'keit\'"); } else if (noun.endsWith("keit") && !noun.endsWith("fertigkeit") && !noun.endsWith("fähigkeit") && !noun.endsWith("tätigkeit") && !noun.endsWith("geschwindigkeit") && !noun.endsWith("wahrscheinlichkeit") && !noun.endsWith("persinlichkeit") && !noun.endsWith("möglichkeit")) { // System.out.println(noun + ": -keit"); writeFile(nonCompoundsFinalFile + fileEnding, "np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'-keit\'; No compound induced\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); // return new CompoundDBEntry("", "", "", id, "GN: ends with \'keit\'"); } else if (noun.endsWith("heit") && !germaNetCaseInsensitive.getLexUnits(noun.substring(0, noun.lastIndexOf("heit"))).isEmpty()) { // System.out.println(noun + ": existing word + heit"); writeFile(nonCompoundsFinalFile + fileEnding, "np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'-heit\'; No compound induced\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); // return new CompoundDBEntry("", "", "", id, "GN: existing word + \'heit\'"); } else if (smorCompound.getHead().equals("") && gnCompound.getHead().equals("") && asvCompound.getHead().equals("")) { if (noun.endsWith("keit") || noun.endsWith("heit") || noun.endsWith("ität") || noun.endsWith("ung") || noun.endsWith("tum") || noun.endsWith("schaft") || noun.endsWith("tion")) { writeFile(nonCompoundsToReviewFile + fileEnding, id + "\t" + noun + ": SMOR, GN and ASV are null; noun ends with \'-keit\'/\'-heit\'/\'-ität\'/\'-ung\'/\'-tum\'/\'-schaft\'/\'-tion\'\n", true); if (!smorCompound.getComment().equals("")) { writeFile(nonCompoundsToReviewFile + fileEnding, smorCompound.getComment() + "\n", true); } if (!gnCompound.getComment().equals("")) { writeFile(nonCompoundsToReviewFile + fileEnding, gnCompound.getComment() + "\n", true); } if (!asvCompound.getComment().equals("")) { writeFile(nonCompoundsToReviewFile + fileEnding, asvCompound.getComment() + "\n", true); } writeFile(nonCompoundsToReviewFile + fileEnding, "\n", true); writeFile(nonCompoundsFinalFile + fileEnding, "np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR, GN and ASV are null; noun ends with \'-keit\'/\'-heit\'; No compound induced\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); } else if ((!smorCompound.getComment().equals("") && !gnCompound.getComment().equals("")) || (!asvCompound.getComment().equals("") && !gnCompound.getComment().equals("")) || !asvCompound.getComment().equals("") || !smorCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, id + "\t" + noun + ": SMOR, GN and ASV are null\n", true); if (!smorCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, smorCompound.getComment() + "\n", true); } if (!gnCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, gnCompound.getComment() + "\n", true); } if (!asvCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, asvCompound.getComment() + "\n", true); } writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "\n", true); // writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\n", true); writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR, GN and ASV are null; Compound induced, but not splitted\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); } else { writeFile(nonCompoundsToReviewFile + fileEnding, id + "\t" + noun + ": SMOR, GN and ASV are null\n", true); if (!smorCompound.getComment().equals("")) { writeFile(nonCompoundsToReviewFile + fileEnding, smorCompound.getComment() + "\n", true); } if (!gnCompound.getComment().equals("")) { writeFile(nonCompoundsToReviewFile + fileEnding, gnCompound.getComment() + "\n", true); } if (!asvCompound.getComment().equals("")) { writeFile(nonCompoundsToReviewFile + fileEnding, asvCompound.getComment() + "\n", true); } writeFile(nonCompoundsToReviewFile + fileEnding, "\n", true); writeFile(nonCompoundsFinalFile + fileEnding, "np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR, GN and ASV are null; No compound induced\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); } } else if (smorCompound.getHead().equals("") && gnCompound.getHead().equals("")) { // if (!smorCompound.getComment().equals("") || !gnCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, id + "\t" + noun + ": SMOR and GN are null\n", true); writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, asvCompound.toSQLString() + "\n", true); if (!smorCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, smorCompound.getComment() + "\n", true); } if (!gnCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, gnCompound.getComment() + "\n", true); } writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "\n", true); // writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\n", true); writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR and GN are null; Compound induced, but not splitted\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); // } else { // writeFile(nonCompoundsFile + fileEnding, id + "\t" + noun + ": SMOR and GN are null\n", true); // writeFile(nonCompoundsFile + fileEnding, asvCompound.toSQLString() + "\n", true); // if (!smorCompound.getComment().equals("")) { // writeFile(nonCompoundsFile + fileEnding, smorCompound.getComment() + "\n", true); // } // if (!gnCompound.getComment().equals("")) { // writeFile(nonCompoundsFile + fileEnding, gnCompound.getComment() + "\n", true); // } // writeFile(nonCompoundsFile + fileEnding, "\n", true); // } } else if (gnCompound.getHead().equals("") && asvCompound.getHead().equals("")) { CompoundDBEntry compound = smorCompound; compound.setComment("GN and ASV are null; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else if (smorCompound.getHead().equals("") && asvCompound.getHead().equals("")) { if (gnCompound.isModifierInGermaNet() && gnCompound.getHeadHypernymDistance() < 5 && gnCompound.getHeadHypernymDistance() > 0) { CompoundDBEntry compound = gnCompound; compound.setComment("SMOR and ASV are null; " + gnCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else if (!smorCompound.getComment().equals("") || !asvCompound.getComment().equals("") // || gnCompound.getComment().equals("GN: head is hypernym") // || gnCompound.getComment().equals("GN: head is part-whole related") ) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, id + "\t" + noun + ": SMOR and ASV are null\n", true); writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, gnCompound.toSQLString() + "\n", true); if (!smorCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, smorCompound.getComment() + "\n", true); } if (!asvCompound.getComment().equals("")) { writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, asvCompound.getComment() + "\n", true); } writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "\n", true); // writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\n", true); writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR and ASV are null; Compound induced, but not splitted\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); } else { writeFile(nonCompoundsToReviewFile + fileEnding, id + "\t" + noun + ": SMOR and ASV are null\n", true); writeFile(nonCompoundsToReviewFile + fileEnding, gnCompound.toSQLString() + "\n", true); if (!smorCompound.getComment().equals("")) { writeFile(nonCompoundsToReviewFile + fileEnding, smorCompound.getComment() + "\n", true); } if (!asvCompound.getComment().equals("")) { writeFile(nonCompoundsToReviewFile + fileEnding, asvCompound.getComment() + "\n", true); } writeFile(nonCompoundsToReviewFile + fileEnding, "\n", true); writeFile(nonCompoundsFinalFile + fileEnding, "np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR and ASV are null; No compound induced\t" + smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t" + asvCompoundClone.toSmallSQLString() + "\n", true); } } else if (asvCompound.getHead().equals("")) { if (smorCompound.equals(gnCompound)) { CompoundDBEntry compound = smorCompound; compound.setComment("SMOR and GN agree, ASV is null; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, gnCompound); if (compound != null) { compound.setComment("SMOR and GN do not agree, ASV is null; " + compound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { compound = smorCompound; compound.setComment("SMOR and GN do not agree, ASV is null; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } } } else if (gnCompound.getHead().equals("")) { if (smorCompound.equals(asvCompound)) { CompoundDBEntry compound = smorCompound; compound.setComment("SMOR and ASV agree, GN is null; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, asvCompound); if (compound != null) { compound.setComment("SMOR and ASV do not agree, GN is null; " + compound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { compound = smorCompound; compound.setComment("SMOR and ASV do not agree, GN is null; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } } } else if (smorCompound.getHead().equals("")) { if (asvCompound.equals(gnCompound)) { CompoundDBEntry compound = gnCompound; compound.setComment("GN and ASV agree, SMOR is null; " + gnCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { CompoundDBEntry compound = determineIfOneCompoundOutperformsTheOther(gnCompound, asvCompound); if (compound != null) { compound.setComment("GN and ASV do not agree, SMOR is null; " + compound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { compound = gnCompound; compound.setComment("GN and ASV do not agree, SMOR is null; " + gnCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); // writeFile(nonCompoundsFile + fileEnding, id + "\t" + noun + ": GN and ASV do not agree, SMOR is null\n", true); // writeFile(nonCompoundsFile + fileEnding, gnCompound.toSQLString(), true); // writeFile(nonCompoundsFile + fileEnding, asvCompound.toSQLString() + "\n", true); } } } else if (smorCompound.equals(gnCompound) && smorCompound.equals(asvCompound)) { CompoundDBEntry compound = smorCompound; compound.setComment("SMOR, GN, and ASV agree; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else if (smorCompound.equals(gnCompound) && !smorCompound.equals(asvCompound)) { CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, asvCompound); if (compound != null) { // System.out.println("1"); compound.setComment("SMOR and GN agree, ASV not; " + compound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { // System.out.println("2"); compound = smorCompound; compound.setComment("SMOR and GN agree, ASV not; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } } else if (smorCompound.equals(asvCompound) && !smorCompound.equals(gnCompound)) { CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, gnCompound); if (compound != null) { compound.setComment("SMOR and ASV agree, GN not; " + compound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { compound = smorCompound; compound.setComment("SMOR and ASV agree, GN not; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } } else if (gnCompound.equals(asvCompound) && !gnCompound.equals(smorCompound)) { CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, gnCompound); if (compound != null) { compound.setComment("GN and ASV agree, SMOR not; " + compound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { compound = gnCompound; compound.setComment("GN and ASV agree, SMOR not; " + gnCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } } else { CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, asvCompound); if (compound == null) { compound = determineIfOneCompoundOutperformsTheOther(smorCompound, gnCompound); if (compound == null) { compound = determineIfOneCompoundOutperformsTheOther(gnCompound, asvCompound); } } else if (determineIfOneCompoundOutperformsTheOther(compound, gnCompound) != null) { compound = determineIfOneCompoundOutperformsTheOther(compound, gnCompound); } if (compound != null) { compound.setComment("SMOR, GN, and ASV do not agree; " + compound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } else { compound = smorCompound; compound.setComment("SMOR, GN, and ASV do not agree; " + smorCompound.getComment()); writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone); } } } // writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, "\\.\n", true); // writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "\\.\n", true); // fromLexUnitId = toLexUnitId + 1; // toLexUnitId += 5000; // } } private void writeFinalResultInCorrectFile(CompoundDBEntry compound, CompoundDBEntry smorCompound, CompoundDBEntry gnCompound, CompoundDBEntry asvCompound) throws IOException { // if (compound.getModifierId() > 0 && compound.getModifierHypernymDistance() < 0) { // writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, compound.toSQLString(), true); // } else { /* if (smorComment.getComment().equals("")) { smorComment = "\\N"; } if (gnComment.equals("")) { gnComment = "\\N"; } if (asvComment.equals("")) { asvComment = "\\N"; }*/ writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, compound.toSQLString() // + "\t" + smorCompound.toSmallSQLString() + "\t" // + gnCompound.toSmallSQLString() + "\t" + asvCompound.toSmallSQLString() + "\n", true); // } } private void processCompoundsWithHyphens() throws SQLException, IOException { compoundDeterminerGN.processCompoundsWithHyphens(); } private void processCompoundsWithHyphens(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException { compoundDeterminerGN.processCompoundsWithHyphens(fromLexUnitId, toLexUnitId); } private void processNonCompoundsWithEmptySpace() throws SQLException, IOException { compoundDeterminerGN.processNonCompoundsWithEmptySpace(); } private void processNonCompoundsWithEmptySpace(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException { compoundDeterminerGN.processNonCompoundsWithEmptySpace(fromLexUnitId, toLexUnitId); } private CompoundDBEntry determineIfSmorOutperformsOtherCompound(CompoundDBEntry smorCompound, CompoundDBEntry otherCompound) { String smorComment = smorCompound.getComment().substring(smorCompound.getComment().lastIndexOf(":") + 2); // System.out.println("smorComment=" + smorComment); if (smorCompound.getHead().equals(otherCompound.getHead()) && !smorCompound.getModifier().equals(otherCompound.getModifier())) { smorComment = smorComment.split(" ")[0]; // System.out.println("smorComment1=" + smorComment); if ((smorComment.contains("") || smorComment.contains("")) && otherCompound.getHead().substring(0, 1).matches("[A-Z]")) { return smorCompound; } } else if (smorCompound.getModifier().equals(otherCompound.getModifier()) && !smorCompound.getHead().equals(otherCompound.getHead())) { smorComment = smorComment.substring(smorComment.lastIndexOf(" ")); // System.out.println("smorComment2=" + smorComment); // System.out.println(otherCompound.getHead() + " " + otherCompound.getHead().matches("[a-z].*")); if ((smorComment.contains("") || smorComment.contains(""))) { return otherCompound; } else if (otherCompound.getHead().matches("[a-zäöü].*") && !smorCompound.getHead().matches("[a-zäöü].*")) { return smorCompound; } else if (!otherCompound.getHead().matches("[a-zäöü].*") && smorCompound.getHead().matches("[a-zäöü].*")) { return otherCompound; } else if (smorCompound.isHeadInGermaNet() && !otherCompound.isHeadInGermaNet()) { return smorCompound; } else if (!smorCompound.isHeadInGermaNet() && otherCompound.isHeadInGermaNet()) { return otherCompound; // } else if (smorCompound.getHeadId() > 0 && otherCompound.getHeadId() < 0) { // return smorCompound; // } else if (smorCompound.getHeadId() < 0 && otherCompound.getHeadId() > 0) { // return otherCompound; // } else if (smorCompound.getHeadId() < otherCompound.getHeadId()) { // return smorCompound; // } else if (smorCompound.getHeadId() > otherCompound.getHeadId()) { // return otherCompound; // } else if (smorCompound.getHeadHypernymDistance() > 0 && otherCompound.getHeadHypernymDistance() < 0) { // return smorCompound; // } else if (smorCompound.getHeadHypernymDistance() < 0 && otherCompound.getHeadHypernymDistance() > 0) { // return otherCompound; } else if (smorCompound.getHeadHypernymDistance() > otherCompound.getHeadHypernymDistance()) { // System.out.println(germaNet.getLexUnitByID(smorCompound.getCompoundId()).getOrthForm() + " " + smorCompound.getCompoundId() // + " has head hypernyms " + smorCompound.getHead() + " (correct) and " + otherCompound.getHead()); return smorCompound; } else if (smorCompound.getHeadHypernymDistance() < otherCompound.getHeadHypernymDistance()) { // System.out.println(germaNet.getLexUnitByID(smorCompound.getCompoundId()).getOrthForm() + " " + smorCompound.getCompoundId() // + " has head hypernyms " + smorCompound.getHead() + " and " + otherCompound.getHead() + " (correct)"); return otherCompound; } else if (smorCompound.getHead().length() > otherCompound.getHead().length()) { return smorCompound; } else if (smorCompound.getHead().length() < otherCompound.getHead().length()) { return otherCompound; } } return determineIfOneCompoundOutperformsTheOther(smorCompound, otherCompound); } private CompoundDBEntry determineIfOneCompoundOutperformsTheOther(CompoundDBEntry compound1, CompoundDBEntry compound2) { // System.out.println(compound1.toSQLString()); // System.out.println(compound2.toSQLString()); // System.out.println(compound1.isHeadInGermaNet() + " " + compound1.isModifierInGermaNet() + " " + // compound2.isHeadInGermaNet() + " " + compound2.isModifierInGermaNet()); if (compound1.isHeadInGermaNet() && compound1.isModifierInGermaNet() && !(compound2.isHeadInGermaNet() && compound2.isModifierInGermaNet())) { // System.out.println("A"); return compound1; } else if (!(compound1.isHeadInGermaNet() && compound1.isModifierInGermaNet()) && compound2.isHeadInGermaNet() && compound2.isModifierInGermaNet()) { // System.out.println("B"); return compound2; } else if (compound1.isHeadInGermaNet() && !compound2.isHeadInGermaNet()) { // System.out.println("C"); return compound1; } else if (!compound1.isHeadInGermaNet() && compound2.isHeadInGermaNet()) { // System.out.println("D"); return compound2; // } else if (compound1.getHeadHypernymDistance() > 0 && compound2.getHeadHypernymDistance() < 0) { // return compound1; // } else if (compound1.getHeadHypernymDistance() < 0 && compound2.getHeadHypernymDistance() > 0) { // return compound2; } else if (compound1.getHeadHypernymDistance() > compound2.getHeadHypernymDistance()) { // System.out.println(germaNet.getLexUnitByID(compound1.getCompoundId()).getOrthForm() + " " + compound1.getCompoundId() // + " has head hypernyms " + compound1.getHead() + " (correct) and " + compound2.getHead()); return compound1; // this is against common sense, but e.g.: //Müll+Verbrennungsanlage getHeadHypernymDistance: 1 //Müllverbrennung+Anlage getHeadHypernymDistance: 3 (this is correct binary splitting) // or: //leicht+ Metallgießerei getHeadHypernymDistance: 1 //Leichtmetall+Gießerei getHeadHypernymDistance: 2 (this is correct binary splitting) // this also explains, why we do not need to check if one head-hypernym is not -1 } else if (compound1.getHeadHypernymDistance() < compound2.getHeadHypernymDistance()) { // System.out.println(germaNet.getLexUnitByID(compound1.getCompoundId()).getOrthForm() + " " + compound1.getCompoundId() // + " has head hypernyms " + compound1.getHead() + " and " + compound2.getHead() + " (correct)"); return compound2; // this is against common sense, see above } else if (compound1.getModifierHypernymDistance() > 0 && compound2.getModifierHypernymDistance() < 0) { // System.out.println("E"); return compound1; } else if (compound1.getModifierHypernymDistance() < 0 && compound2.getModifierHypernymDistance() > 0) { // System.out.println("F"); return compound2; } else if (compound1.getModifierHypernymDistance() > compound2.getModifierHypernymDistance()) { // System.out.println("G"); return compound2; // the same that accounts for head-hypernyms (see above) might also account for modifier-hypernyms. // here, this is not verified, but done intuitively (not the same than for head-hypernym). } else if (compound1.getModifierHypernymDistance() > compound2.getModifierHypernymDistance()) { // System.out.println("H"); return compound2; } else if (compound1.isHeadInGermaNet() && !compound2.isHeadInGermaNet()) { // System.out.println("head of compound1 (" + compound1.getHead() // + ") is in GermaNet, but head of compound2 (" + compound2.getHead() + ") not)"); return compound1; } else if (!compound1.isHeadInGermaNet() && compound2.isHeadInGermaNet()) { // System.out.println("head of compound2 (" + compound2.getHead() // + ") is in GermaNet, but head of compound1 (" + compound1.getHead() + ") not)"); return compound2; } else if (compound1.isModifierInGermaNet() && !compound2.isModifierInGermaNet()) { // System.out.println("modifier of compound1 (" + compound1.getModifier() // + ") is in GermaNet, but modifier of compound2 (" + compound2.getModifier() + ") not)"); return compound1; } else if (!compound1.isModifierInGermaNet() && compound2.isModifierInGermaNet()) { // System.out.println("modifier of compound2 (" + compound2.getModifier() // + ") is in GermaNet, but modifier of compound1 (" + compound1.getModifier() + ") not)"); return compound2; } else { // System.out.println("K"); return null; } } }