Commit 0b611765 authored by Ben Campbell
Browse files

Initial commit

parents
File added
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry including="**/*.java" kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/classes"/>
</classpath>
/target/
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>Decompounder-mvn</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
</projectDescription>
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding//src/test/resources=UTF-8
encoding/<project>=UTF-8
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.6
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>de.tuebingen.uni.sfs.germanet</groupId>
<artifactId>Decompounder-mvn</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>Decompounder</name>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<scm>
<url>https://weblicht.sfs.uni-tuebingen.de/gitlab/germanet/Decompounder-mvn</url>
<connection>scm:git:https://weblicht.sfs.uni-tuebingen.de/gitlab/germanet/Decompounder-mvn</connection>
<developerConnection>scm:git:https://weblicht.sfs.uni-tuebingen.de/gitlab/germanet/Decompounder-mvn.git
</developerConnection>
<tag>HEAD</tag>
</scm>
<!-- Additional artifact repositories consulted besides Maven Central when resolving the dependencies below. -->
<!-- NOTE(review): all three URLs use plain http. Maven 3.8.1 and later block external http repositories by default, -->
<!-- so resolution from these hosts will presumably fail on a current Maven unless https URLs (or a mirror override) are configured. -->
<!-- Confirm that the servers offer https before changing the URLs. -->
<repositories>
<repository>
<id>sfs-clarind-nexus</id>
<url>http://t.weblicht.sfs.uni-tuebingen.de/nexus/content/repositories/releases</url>
</repository>
<repository>
<id>sfs-clarind-nexus-snapshot</id>
<url>http://t.weblicht.sfs.uni-tuebingen.de/nexus/content/repositories/snapshots</url>
</repository>
<repository>
<id>TU-Darmstadt</id>
<url>http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-releases/</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>de.tuebingen.uni.sfs.germanet</groupId>
<artifactId>GernEdiT</artifactId>
<version>9.1</version>
</dependency>
<dependency>
<groupId>antlr</groupId>
<artifactId>antlr</artifactId>
<version>2.7.6</version>
</dependency>
<dependency>
<groupId>asm</groupId>
<artifactId>asm-attrs</artifactId>
<version>2.2.3</version>
</dependency>
<dependency>
<groupId>asm</groupId>
<artifactId>asm</artifactId>
<version>3.3.1</version>
</dependency>
<dependency>
<groupId>cglib</groupId>
<artifactId>cglib</artifactId>
<version>2.1_3</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.1</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
<version>1.2.3</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>ejb3-persistence</artifactId>
<version>1.0.2.GA</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>8.3-606.jdbc3</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-annotations</artifactId>
<version>3.5.6-Final</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-commons-annotations</artifactId>
<version>3.2.0.Final</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-entitymanager</artifactId>
<version>3.5.6-Final</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-tools</artifactId>
<version>3.5.1.Final</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate</artifactId>
<version>3.5.4-Final</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.javassist</groupId>
<artifactId>javassist</artifactId>
<version>3.20.0-GA</version>
</dependency>
<dependency>
<groupId>de.uni_leipzig.asv.toolbox</groupId>
<artifactId>toolbox-utils</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>de.uni_leipzig.asv.toolbox</groupId>
<artifactId>de.uni_leipzig.asv.toolbox.baseforms</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>javax.transaction</groupId>
<artifactId>jta</artifactId>
<version>1.1</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-release-plugin</artifactId>
<version>2.4.2</version>
<dependencies>
<dependency>
<groupId>org.apache.maven.scm</groupId>
<artifactId>maven-scm-provider-gitexe</artifactId>
<version>1.8.1</version>
</dependency>
</dependencies>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.7</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<failOnError>false</failOnError>
<additionalparam>-Xdoclint:none</additionalparam>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>
<!-- Builds a single shaded jar during the package phase that bundles all dependencies. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>1.6</version>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
<filters>
<filter>
<!-- Applies to every bundled artifact: drop jar signature files from META-INF. -->
<!-- Presumably done because signatures of the original jars no longer match the merged jar; confirm. -->
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<!-- Merges META-INF/services entries of the bundled jars. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<!-- Writes the Main-Class entry of the shaded jar's manifest. -->
<!-- NOTE(review): the main class below belongs to the GermaNet editor package, while this project's own -->
<!-- entry point is decompounder.CompoundDeterminer#main. Confirm which class the jar is meant to launch. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>de.tuebingen.uni.sfs.germanet.editor.GermaNetEditorApp</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
<resources>
<resource>
<filtering>false</filtering>
<directory>src/main/java</directory>
<includes>
<include>**</include>
</includes>
<excludes>
<exclude>**/*.java</exclude>
</excludes>
</resource>
<resource>
<filtering>false</filtering>
<directory>src/main/resources</directory>
<includes>
<include>**</include>
</includes>
</resource>
</resources>
</build>
</project>
\ No newline at end of file
File added
package decompounder;
import decompounder.CompoundDisambiguator.DisambiguatedHypernym;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit;
import java.util.ArrayList;
import java.util.List;
/**
 * Holds one compound together with its split into modifier and head, the
 * GermaNet ids resolved for both constituents, and a free-text comment, and
 * renders that data as tab-separated text.
 * <p>
 * Instances are mutable. {@link #equals(Object)} and {@link #hashCode()} are
 * based on mutable fields, so an entry must not be modified while it is used
 * as a key in a hash-based collection.
 *
 * @author Verena Henrich
 */
public class CompoundDBEntry {

    public static final String COMPOSITIONAL_TYPE_EXACT_MATCH = "exact match";
    public static final String COMPOSITIONAL_TYPE_MODIFIER_E = "modifier e";
    public static final String COMPOSITIONAL_TYPE_INTERFIX = "interfix";
    public static final String COMPOSITIONAL_TYPE_HEAD_IS_HYPERNYM = "head is hypernym";

    /** Header line of a {@code COPY ... FROM stdin} statement for {@code compounds_table}. */
    public static final String COPY_INTO_QUERY = "COPY compounds_table "
            + "(compound, compound_id, modifier, modifier_id, head, head_id, "
            + "modifier_in_gn, head_in_gn, modifier_hypernym_distance, "
            + "head_hypernym_distance, comment) FROM stdin;\n";

    /** Text emitted in place of an absent value in the tab-separated output. */
    private static final String SQL_NULL = "\\N";

    private String modifier;
    private String head;
    private int compoundId;
    private int modifierId = -1;
    private int headId = -1;
    private String comment = SQL_NULL;
    private boolean modifierIsInGermaNet = false;
    private boolean headIsInGermaNet = false;
    private int headHypernymDistance = -1;
    private int modifierHypernymDistance = -1;
    // Not read anywhere in this class; kept so the field layout stays unchanged.
    private String compositionalType = SQL_NULL;
    private String compound = SQL_NULL;

    /**
     * Creates an entry from fully specified values; nothing is looked up.
     *
     * @param compound the compound's orthographic form
     * @param modifier the modifier constituent
     * @param modifierId id of the modifier, or -1 if unknown
     * @param head the head constituent
     * @param headId id of the head, or -1 if unknown
     * @param compoundId id of the compound
     * @param comment free-text comment
     * @param modifierIsInGermaNet whether the modifier is contained in GermaNet
     * @param headIsInGermaNet whether the head is contained in GermaNet
     * @param modifierHypernymDistance hypernym distance recorded for the modifier, or -1
     * @param headHypernymDistance hypernym distance recorded for the head, or -1
     */
    public CompoundDBEntry(String compound, String modifier, int modifierId, String head, int headId,
            int compoundId, String comment, boolean modifierIsInGermaNet,
            boolean headIsInGermaNet, int modifierHypernymDistance, int headHypernymDistance) {
        this.compound = compound;
        this.modifier = modifier;
        this.head = head;
        this.modifierId = modifierId;
        this.headId = headId;
        this.compoundId = compoundId;
        this.comment = comment;
        this.modifierIsInGermaNet = modifierIsInGermaNet;
        this.headIsInGermaNet = headIsInGermaNet;
        this.modifierHypernymDistance = modifierHypernymDistance;
        this.headHypernymDistance = headHypernymDistance;
    }

    /**
     * Creates an entry without any GermaNet information: both ids and both
     * hypernym distances stay -1 and both "is in GermaNet" flags stay false.
     *
     * @param compound the compound's orthographic form
     * @param modifier the modifier constituent
     * @param head the head constituent
     * @param compoundId id of the compound
     * @param comment free-text comment
     */
    public CompoundDBEntry(String compound, String modifier, String head, int compoundId, String comment) {
        this.compound = compound;
        this.modifier = modifier;
        this.head = head;
        this.compoundId = compoundId;
        this.comment = comment;
    }

    /**
     * Creates an entry and resolves the compound's orthographic form as well
     * as the ids of modifier and head through the given GermaNet instance.
     * <p>
     * For each constituent, a hypernym disambiguated via
     * {@code CompoundDisambiguator.disambiguateHypernym} wins; otherwise the
     * id is only set when exactly one lexical unit matches, and the
     * constituent is flagged as not in GermaNet when none matches.
     *
     * @param modifier the modifier constituent
     * @param head the head constituent
     * @param compoundId id of the compound's lexical unit
     * @param comment free-text comment
     * @param germaNet GermaNet instance used for all lookups
     */
    public CompoundDBEntry(String modifier, String head, int compoundId,
            String comment, GermaNet germaNet) {
        this.modifier = modifier;
        this.head = head;
        this.compoundId = compoundId;
        this.comment = comment;

        // Look the compound up once and reuse it for both constituents.
        LexUnit compoundUnit = germaNet.getLexUnitByID(compoundId);
        this.compound = compoundUnit.getOrthForm();

        ConstituentLookup modifierLookup = lookUpConstituent(compoundUnit, modifier, germaNet);
        this.modifierId = modifierLookup.id;
        this.modifierHypernymDistance = modifierLookup.hypernymDistance;
        this.modifierIsInGermaNet = modifierLookup.inGermaNet;

        ConstituentLookup headLookup = lookUpConstituent(compoundUnit, head, germaNet);
        this.headId = headLookup.id;
        this.headHypernymDistance = headLookup.hypernymDistance;
        this.headIsInGermaNet = headLookup.inGermaNet;
    }

    /** Result of resolving one constituent (modifier or head) against GermaNet. */
    private static final class ConstituentLookup {
        int id = -1;
        int hypernymDistance = -1;
        boolean inGermaNet = true;
    }

    /** Resolves id, hypernym distance and GermaNet membership of one constituent. */
    private static ConstituentLookup lookUpConstituent(LexUnit compoundUnit, String constituent,
            GermaNet germaNet) {
        ConstituentLookup lookup = new ConstituentLookup();
        DisambiguatedHypernym hypernym =
                CompoundDisambiguator.disambiguateHypernym(compoundUnit, constituent, germaNet);
        if (hypernym != null) {
            lookup.id = hypernym.getHypernym().getId();
            lookup.hypernymDistance = hypernym.getDistance();
            return lookup;
        }
        List<LexUnit> lexUnits = germaNet.getLexUnits(constituent);
        if (lexUnits.size() == 1) {
            lookup.id = lexUnits.get(0).getId();
        } else if (lexUnits.isEmpty()) {
            lookup.inGermaNet = false;
        }
        // More than one match: ambiguous, so the id stays -1 but the word counts as present.
        return lookup;
    }

    /**
     * @return the id of the compound
     */
    public Integer getCompoundId() {
        return this.compoundId;
    }

    /**
     * @return the modifier
     */
    public String getModifier() {
        return this.modifier;
    }

    /**
     * @param modifier the modifier to set
     */
    public void setModifier(String modifier) {
        this.modifier = modifier;
    }

    /**
     * @return the head
     */
    public String getHead() {
        return this.head;
    }

    /**
     * @param head the head to set
     */
    public void setHead(String head) {
        this.head = head;
    }

    /**
     * @return the modifierId
     */
    public int getModifierId() {
        return modifierId;
    }

    /**
     * @param modifierId the modifierId to set
     */
    public void setModifierId(int modifierId) {
        this.modifierId = modifierId;
    }

    /**
     * @return the headId
     */
    public int getHeadId() {
        return this.headId;
    }

    /**
     * @param headId the headId to set
     */
    public void setHeadId(int headId) {
        this.headId = headId;
    }

    /**
     * @return whether the modifier is flagged as contained in GermaNet
     */
    public boolean isModifierInGermaNet() {
        return modifierIsInGermaNet;
    }

    /**
     * @param modifierIsInGermaNet whether the modifier is contained in GermaNet
     */
    public void setModifierIsInGermaNet(boolean modifierIsInGermaNet) {
        this.modifierIsInGermaNet = modifierIsInGermaNet;
    }

    /**
     * @return whether the head is flagged as contained in GermaNet
     */
    public boolean isHeadInGermaNet() {
        return headIsInGermaNet;
    }

    /**
     * @param headIsInGermaNet whether the head is contained in GermaNet
     */
    public void setHeadIsInGermaNet(boolean headIsInGermaNet) {
        this.headIsInGermaNet = headIsInGermaNet;
    }

    /**
     * @return the hypernym distance recorded for the modifier, or -1 if none
     */
    public int getModifierHypernymDistance() {
        return modifierHypernymDistance;
    }

    /**
     * @return the hypernym distance recorded for the head, or -1 if none
     */
    public int getHeadHypernymDistance() {
        return headHypernymDistance;
    }

    /**
     * @return the comment
     */
    public String getComment() {
        return this.comment;
    }

    /**
     * @param comment the comment to set
     */
    public void setComment(String comment) {
        this.comment = comment;
    }

    /**
     * Compares the split described by two entries: both "is in GermaNet"
     * flags, modifier, modifier id, head and head id. The compound itself,
     * its id, the comment and the hypernym distances are not compared.
     *
     * @param compoundDBEntry the entry to compare with; may be null
     * @return true if both entries describe the same split
     */
    public boolean equals(CompoundDBEntry compoundDBEntry) {
        if (compoundDBEntry == null) {
            return false;
        }
        return this.modifierIsInGermaNet == compoundDBEntry.isModifierInGermaNet()
                && this.headIsInGermaNet == compoundDBEntry.isHeadInGermaNet()
                && sameText(this.modifier, compoundDBEntry.getModifier())
                && this.modifierId == compoundDBEntry.getModifierId()
                && sameText(this.head, compoundDBEntry.getHead())
                && this.headId == compoundDBEntry.getHeadId();
    }

    /**
     * Overrides {@link Object#equals(Object)} so that collections and
     * {@code Object}-typed callers use the same comparison as
     * {@link #equals(CompoundDBEntry)}.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof CompoundDBEntry)) {
            return false;
        }
        return equals((CompoundDBEntry) other);
    }

    /** Hashes exactly the fields compared by {@link #equals(CompoundDBEntry)}. */
    @Override
    public int hashCode() {
        int result = 17;
        result = 31 * result + (modifierIsInGermaNet ? 1 : 0);
        result = 31 * result + (headIsInGermaNet ? 1 : 0);
        result = 31 * result + (modifier == null ? 0 : modifier.hashCode());
        result = 31 * result + modifierId;
        result = 31 * result + (head == null ? 0 : head.hashCode());
        result = 31 * result + headId;
        return result;
    }

    /** Null-safe string equality (java.util.Objects is unavailable on Java 6). */
    private static boolean sameText(String left, String right) {
        return left == null ? right == null : left.equals(right);
    }

    /**
     * Renders the entry as one tab-separated line without trailing newline:
     * compound, compound id, modifier, two empty columns, head, two empty
     * columns, comment.
     *
     * @return the tab-separated representation
     */
    public String toSQLString() {
        return compound + "\t" + compoundId + "\t"
                + this.modifier + "\t\t\t"
                + this.head + "\t\t\t"
                + this.comment;
    }

    /**
     * Renders modifier, head and comment separated by tabs; an empty (or
     * null) value is written as {@code \N}.
     *
     * @return the tab-separated representation
     */
    public String toSmallSQLString() {
        StringBuilder sql = new StringBuilder();
        sql.append(orSqlNull(this.modifier)).append("\t");
        sql.append(orSqlNull(this.head)).append("\t");
        sql.append(orSqlNull(this.comment));
        return sql.toString();
    }

    /** Maps a missing or empty value to the {@code \N} marker. */
    private static String orSqlNull(String value) {
        return (value == null || value.equals("")) ? SQL_NULL : value;
    }
}
package decompounder;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
/**
*
* @author Verena Henrich
*/
public class CompoundDeterminer {
// JDBC connection opened in the constructor; handed to the GN and SMOR determiners and used by getNouns.
private final Connection connection;
// Credentials used for both GermaNet API objects and for the JDBC connection.
// NOTE(review): credentials are hard-coded in source; consider moving them to configuration.
private static final String username = "germanet";
private static final String password = "germanet";
// GermaNet instance constructed with `true` as last argument; presumably the ignore-case flag (confirm against the GermaNet API).
private final GermaNet germaNetCaseInsensitive;
// GermaNet instance constructed with `false` as last argument; shared with all three determiners.
private final GermaNet germaNet;
// public static String databaseUrl = "jdbc:postgresql://localhost:5432/germanet";
// JDBC URL used for the GermaNet API objects and the raw connection.
public static String databaseUrl = "jdbc:postgresql://localhost:5432/germanet13a";
// Base names of the result files; the current fileEnding is appended when writing.
private String compoundsWithoutHyphensFinalFileName = "compounds_without_hyphens_";
private String compoundsWithoutHyphensToReviewFileName = "compounds_without_hyphens_to_review";
// Suffix of the result files; replaced by "_from_<id>_to_<id>.txt" in processCompoundsWithoutHyphens.
private String fileEnding = ".txt";
private String nonCompoundsFinalFile = "non_compounds";
private String nonCompoundsToReviewFile = "non_compounds_to_review";
// Public counter; not referenced in the part of this class shown here. Presumably updated by other classes (verify).
public static int twoBinaryCompoundsAreInGN = 0;
// Charset names accepted by writeFile.
public static String latin1Encoding = "ISO-8859-1"; // ISO-8859-1 == Latin-1
public static String utf8Encoding = "UTF-8";
// The three analysers whose results are combined per noun; created in the constructor.
private CompoundDeterminerASV compoundDeterminerASV;
private CompoundDeterminerGN compoundDeterminerGN;
private CompoundDeterminerSMOR compoundDeterminerSMOR;
// Directory prefix (including trailing separator) that writeFile prepends to every file name.
public static String resultsFolder = "src/test/resources/results" + File.separatorChar;// + "2013-05-31_all_new_lexunits" + File.separatorChar;
// Selects orth_form and id of lexical units whose synset has word_category_id = 1 (presumably nouns, as the constant name suggests),
// whose orth_form is longer than 3 characters and contains neither a space nor a hyphen,
// restricted to the id range given by the two parameters and ordered by id.
private static final String GET_NOUNS_WITHOUT_HYPHEN_QUERY = "SELECT l.orth_form AS orth_form, l.id AS id "
+ "FROM lex_unit_table l, synset_table s WHERE l.synset_id = s.id "
+ "and s.word_category_id = 1 and length(l.orth_form) > 3 "
+ "and l.orth_form not like '% %' and l.orth_form not like '%-%' "
+ "and l.id between ? and ?"
+ " order by l.id";
/**
 * Command-line entry point. Runs the three processing steps (compounds with
 * hyphens, non-compounds containing a space, compounds without hyphens) over
 * a fixed range of lexical unit ids.
 *
 * @param args ignored
 * @throws IOException propagated from the processing steps
 * @throws Exception if the determiner cannot be set up or a step fails
 */
public static void main(String[] args) throws IOException, Exception {
    /******************************************
     * For creating a new compounds list,     *
     * you need to specify the relevant range *
     * of lexical units (by their IDs)        *
     ******************************************/
    final int firstLexUnitId = 118719; //118719
    final int lastLexUnitId = 118819; //125687 141904

    CompoundDeterminer determiner = new CompoundDeterminer();
    determiner.processCompoundsWithHyphens(firstLexUnitId, lastLexUnitId);
    determiner.processNonCompoundsWithEmptySpace(firstLexUnitId, lastLexUnitId);
    determiner.processCompoundsWithoutHyphens(firstLexUnitId, lastLexUnitId);
}
/**
 * Creates the two GermaNet API objects, opens the JDBC connection to
 * {@code databaseUrl}, and then creates the ASV, GN and SMOR determiners.
 * The GN and SMOR determiners receive the JDBC connection as well.
 *
 * @throws Exception if the JDBC connection cannot be opened (the
 *         {@link SQLException} is wrapped as cause); exceptions thrown by
 *         the collaborators' constructors propagate unchanged
 */
public CompoundDeterminer() throws Exception {
// last argument true/false: presumably the ignore-case switch of the GermaNet API (confirm)
germaNetCaseInsensitive = new GermaNet(username, password, databaseUrl, true);
germaNet = new GermaNet(username, password, databaseUrl, false);
try {
this.connection = DriverManager.getConnection(databaseUrl, username, password);
} catch (SQLException ex) {
// wrap so that the constructor only exposes the generic checked type while keeping the cause
throw new Exception(ex);
}
// all determiners work on the instance created with `false`
compoundDeterminerASV = new CompoundDeterminerASV(germaNet);
compoundDeterminerGN = new CompoundDeterminerGN(germaNet, connection);
compoundDeterminerSMOR = new CompoundDeterminerSMOR(germaNet, connection);
}
/**
 * Writes {@code content} to {@code fileName} inside {@code resultsFolder}
 * using the encoding named by {@code utf8Encoding}; delegates to the
 * four-argument overload.
 *
 * @param fileName file name relative to {@code resultsFolder}
 * @param content text to write
 * @param append true to append to an existing file, false to overwrite it
 * @throws IOException declared by the overload this method delegates to
 */
public static void writeFile(String fileName, String content, boolean append) throws IOException {
writeFile(fileName, content, append, utf8Encoding);
}
/**
 * Writes {@code content} to the file {@code resultsFolder + fileName} in the
 * given encoding.
 * <p>
 * Calls whose file name contains {@code "compounds_determined_with_"} or
 * {@code "_to_review_"} are silently skipped while the local
 * {@code writeTmpFiles} switch is false. Any failure while opening, writing
 * or closing the file is reported on {@code System.err} and not propagated,
 * so writing stays best-effort.
 *
 * @param fileName file name relative to {@code resultsFolder}
 * @param content text to write
 * @param append true to append to an existing file, false to overwrite it
 * @param encoding charset name, e.g. {@code utf8Encoding} or {@code latin1Encoding}
 * @throws IOException declared for callers; this implementation reports
 *         failures on standard error instead of throwing
 */
public static void writeFile(String fileName, String content, boolean append, String encoding) throws IOException {
    // this would produce temporary "in-between" files that are never needed
    boolean writeTmpFiles = false;
    if (!writeTmpFiles &&
            (fileName.contains("compounds_determined_with_") || fileName.contains("_to_review_"))) {
        return;
    }
    try {
        Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
                new File(resultsFolder + fileName), append), encoding));
        try {
            writer.write(content);
        } finally {
            // always release the file handle, even when write() fails
            writer.close();
        }
    } catch (Exception ex) {
        System.err.println(ex.getMessage());
    }
}
/**
 * Runs {@code GET_NOUNS_WITHOUT_HYPHEN_QUERY} for the given id range and
 * collects the rows into a map from lexical unit id to orthographic form.
 * The map preserves the row order of the query (ordered by id).
 *
 * @param fromLexUnitId lower bound of the id range (SQL {@code between}, so inclusive)
 * @param toLexUnitId upper bound of the id range (inclusive)
 * @return id to orthographic form, in query order; empty if no row matches
 * @throws SQLException if preparing, executing or reading the query fails
 */
private Map<Integer, String> getNouns(int fromLexUnitId, int toLexUnitId) throws SQLException {
    Map<Integer, String> nouns = new LinkedHashMap<Integer, String>();
    PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITHOUT_HYPHEN_QUERY);
    try {
        findNounsStatement.setInt(1, fromLexUnitId);
        findNounsStatement.setInt(2, toLexUnitId);
        ResultSet results = findNounsStatement.executeQuery();
        try {
            while (results.next()) {
                nouns.put(results.getInt("id"), results.getString("orth_form"));
            }
        } finally {
            results.close();
        }
    } finally {
        // the statement was never closed before; release it on success and on failure
        findNounsStatement.close();
    }
    return nouns;
}
private void processCompoundsWithoutHyphens(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException {
// while (//fromLexUnitId < germaNet.getNumLexUnits() &&
// toLexUnitId < 125688) {
Map<Integer, String> nouns = getNouns(fromLexUnitId, toLexUnitId);
fileEnding = "_from_" + fromLexUnitId + "_to_" + toLexUnitId + ".txt";
System.out.println("fileEnding=" + fileEnding);
// writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, CompoundDBEntry.COPY_INTO_QUERY, false);
writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, "", false);
// writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, CompoundDBEntry.COPY_INTO_QUERY, false);
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "", false);
writeFile(nonCompoundsFinalFile + fileEnding, "", false);
writeFile(nonCompoundsToReviewFile + fileEnding, "", false);
compoundDeterminerSMOR.setFileEnding(fileEnding);
compoundDeterminerGN.setFileEnding(fileEnding);
compoundDeterminerASV.setFileEnding(fileEnding);
// go through all extracted nouns
Iterator it = nouns.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<Integer, String> entry = (Map.Entry)it.next();
String noun = entry.getValue();
int id = entry.getKey();
CompoundDBEntry smorCompound = compoundDeterminerSMOR.determineCompoundWithSMOR(id, noun);
CompoundDBEntry gnCompound = compoundDeterminerGN.determineCompoundWithGermaNet(id, noun);
CompoundDBEntry asvCompound = compoundDeterminerASV.determineCompoundWithASV(id, noun);
CompoundDBEntry smorCompoundClone = new CompoundDBEntry(noun, smorCompound.getModifier(), smorCompound.getHead(), id, smorCompound.getComment());
CompoundDBEntry gnCompoundClone = new CompoundDBEntry(noun, gnCompound.getModifier(), gnCompound.getHead(), id, gnCompound.getComment());
CompoundDBEntry asvCompoundClone = new CompoundDBEntry(noun, asvCompound.getModifier(), asvCompound.getHead(), id, asvCompound.getComment());
// System.out.println(smorCompound.toSQLString());
// System.out.println(gnCompound.toSQLString());
// System.out.println(asvCompound.toSQLString());
HashMap<String, String> changeModifiers = new HashMap<String, String>();
changeModifiers.put("Neben", "neben");
changeModifiers.put("Gegen", "gegen");
changeModifiers.put("Haupt", "haupt");
changeModifiers.put("General", "general");
changeModifiers.put("Rück", "rück");
changeModifiers.put("Elektro", "elektro");
changeModifiers.put("Ober", "ober");
changeModifiers.put("Spitz", "spitz");
changeModifiers.put("Mini", "mini");
changeModifiers.put("Sonder", "sonder");
changeModifiers.put("Brutto", "brutto");
changeModifiers.put("Netto", "netto");
changeModifiers.put("Bio", "bio");
changeModifiers.put("Über", "über");
changeModifiers.put("Audio", "audio");
changeModifiers.put("Midi", "midi");
changeModifiers.put("Lokal", "lokal");
changeModifiers.put("Mikro", "mikro");
changeModifiers.put("Makro", "makro");
changeModifiers.put("Mittel", "mittel");
changeModifiers.put("Meta", "meta");
changeModifiers.put("Tief", "tief");
changeModifiers.put("Zwischen", "zwischen");
changeModifiers.put("Zweit", "zweit");
changeModifiers.put("einzel", "einzeln");
changeModifiers.put("Einzel", "einzeln");
changeModifiers.put("doppel", "doppelt");
changeModifiers.put("Doppel", "doppelt");
changeModifiers.put("erst", "erste");
changeModifiers.put("Putz", "putzen");
changeModifiers.put("Spann", "spannen");
changeModifiers.put("Warte", "warten");
changeModifiers.put("Fernseh", "Fernsehen");
if (changeModifiers.containsKey(smorCompound.getModifier())) {
smorCompound.setModifier(changeModifiers.get(smorCompound.getModifier()));
}
if (changeModifiers.containsKey(asvCompound.getModifier())) {
asvCompound.setModifier(changeModifiers.get(asvCompound.getModifier()));
}
if (changeModifiers.containsKey(gnCompound.getModifier())) {
gnCompound.setModifier(changeModifiers.get(gnCompound.getModifier()));
}
if (smorCompound.getHead().matches("[a-zäöü].*")) {
// System.out.println(noun + " " + id + ": smor-head small case: " + smorCompound.getHead());
smorCompound.setHead("");
} else if (!noun.endsWith(smorCompound.getHead().toLowerCase())) {
// System.out.println(noun + " " + id + ": smor-head does not equal noun: " + smorCompound.getHead());
smorCompound.setHead("");
}
if (gnCompound.getHead().matches("[a-zäöü].*")) {
// System.out.println(noun + " " + id + ": gn-head small case: " + gnCompound.getHead());
gnCompound.setHead("");
} else if (!noun.endsWith(gnCompound.getHead().toLowerCase())) {
// System.out.println(noun + " " + id + ": gn-head does not equal noun: " + gnCompound.getHead());
gnCompound.setHead("");
}
if (asvCompound.getHead().matches("[a-zäöü].*")) {
// System.out.println(noun + " " + id + ": asv-head small case: " + asvCompound.getHead());
asvCompound.setHead("");
} else if (!noun.endsWith(asvCompound.getHead().toLowerCase())) {
// System.out.println(noun + " " + id + ": asv-head does not equal noun: " + asvCompound.getHead());
asvCompound.setHead("");
}
if (noun.endsWith("machung")) {
// System.out.println(noun + ": -machung");
writeFile(nonCompoundsFinalFile + fileEnding,
"np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'machung\'; No compound induced\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
} else if (noun.endsWith("igkeit")
&& !germaNetCaseInsensitive.getLexUnits(noun.substring(0, noun.lastIndexOf("igkeit"))).isEmpty()) {
// System.out.println(noun + ": existing word + igkeit");
writeFile(nonCompoundsFinalFile + fileEnding,
"np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'igkeit\'; No compound induced\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
// return new CompoundDBEntry("", "", "", id, "GN: existing word + \'igkeit\'");
} else if (noun.endsWith("keit")
&& !germaNetCaseInsensitive.getLexUnits(noun.substring(0, noun.lastIndexOf("keit"))).isEmpty()) {
// System.out.println(noun + ": existing word + keit");
writeFile(nonCompoundsFinalFile + fileEnding,
"np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'-keit\'; No compound induced\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
// return new CompoundDBEntry("", "", "", id, "GN: existing word + \'keit\'");
} else if (noun.endsWith("keit")
&& !noun.endsWith("fertigkeit")
&& !noun.endsWith("fähigkeit")
&& !noun.endsWith("tätigkeit")
&& !noun.endsWith("geschwindigkeit")
&& !noun.endsWith("wahrscheinlichkeit")
&& !noun.endsWith("persinlichkeit")
&& !noun.endsWith("möglichkeit")) {
// System.out.println(noun + ": -keit");
writeFile(nonCompoundsFinalFile + fileEnding,
"np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'-keit\'; No compound induced\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
// return new CompoundDBEntry("", "", "", id, "GN: ends with \'keit\'");
} else if (noun.endsWith("heit")
&& !germaNetCaseInsensitive.getLexUnits(noun.substring(0, noun.lastIndexOf("heit"))).isEmpty()) {
// System.out.println(noun + ": existing word + heit");
writeFile(nonCompoundsFinalFile + fileEnding,
"np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tnoun ends with \'-heit\'; No compound induced\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
// return new CompoundDBEntry("", "", "", id, "GN: existing word + \'heit\'");
} else if (smorCompound.getHead().equals("") && gnCompound.getHead().equals("") && asvCompound.getHead().equals("")) {
if (noun.endsWith("keit") || noun.endsWith("heit") || noun.endsWith("ität") || noun.endsWith("ung")
|| noun.endsWith("tum") || noun.endsWith("schaft") || noun.endsWith("tion")) {
writeFile(nonCompoundsToReviewFile + fileEnding, id + "\t" + noun + ": SMOR, GN and ASV are null; noun ends with \'-keit\'/\'-heit\'/\'-ität\'/\'-ung\'/\'-tum\'/\'-schaft\'/\'-tion\'\n", true);
if (!smorCompound.getComment().equals("")) {
writeFile(nonCompoundsToReviewFile + fileEnding, smorCompound.getComment() + "\n", true);
}
if (!gnCompound.getComment().equals("")) {
writeFile(nonCompoundsToReviewFile + fileEnding, gnCompound.getComment() + "\n", true);
}
if (!asvCompound.getComment().equals("")) {
writeFile(nonCompoundsToReviewFile + fileEnding, asvCompound.getComment() + "\n", true);
}
writeFile(nonCompoundsToReviewFile + fileEnding, "\n", true);
writeFile(nonCompoundsFinalFile + fileEnding,
"np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR, GN and ASV are null; noun ends with \'-keit\'/\'-heit\'; No compound induced\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
} else if ((!smorCompound.getComment().equals("") && !gnCompound.getComment().equals(""))
|| (!asvCompound.getComment().equals("") && !gnCompound.getComment().equals(""))
|| !asvCompound.getComment().equals("")
|| !smorCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, id + "\t" + noun + ": SMOR, GN and ASV are null\n", true);
if (!smorCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, smorCompound.getComment() + "\n", true);
}
if (!gnCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, gnCompound.getComment() + "\n", true);
}
if (!asvCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, asvCompound.getComment() + "\n", true);
}
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "\n", true);
// writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\n", true);
writeFile(compoundsWithoutHyphensFinalFileName + fileEnding,
noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR, GN and ASV are null; Compound induced, but not splitted\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
} else {
writeFile(nonCompoundsToReviewFile + fileEnding, id + "\t" + noun + ": SMOR, GN and ASV are null\n", true);
if (!smorCompound.getComment().equals("")) {
writeFile(nonCompoundsToReviewFile + fileEnding, smorCompound.getComment() + "\n", true);
}
if (!gnCompound.getComment().equals("")) {
writeFile(nonCompoundsToReviewFile + fileEnding, gnCompound.getComment() + "\n", true);
}
if (!asvCompound.getComment().equals("")) {
writeFile(nonCompoundsToReviewFile + fileEnding, asvCompound.getComment() + "\n", true);
}
writeFile(nonCompoundsToReviewFile + fileEnding, "\n", true);
writeFile(nonCompoundsFinalFile + fileEnding,
"np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR, GN and ASV are null; No compound induced\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
}
} else if (smorCompound.getHead().equals("") && gnCompound.getHead().equals("")) {
// if (!smorCompound.getComment().equals("") || !gnCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, id + "\t" + noun + ": SMOR and GN are null\n", true);
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, asvCompound.toSQLString() + "\n", true);
if (!smorCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, smorCompound.getComment() + "\n", true);
}
if (!gnCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, gnCompound.getComment() + "\n", true);
}
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "\n", true);
// writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\n", true);
writeFile(compoundsWithoutHyphensFinalFileName + fileEnding,
noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR and GN are null; Compound induced, but not splitted\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
// } else {
// writeFile(nonCompoundsFile + fileEnding, id + "\t" + noun + ": SMOR and GN are null\n", true);
// writeFile(nonCompoundsFile + fileEnding, asvCompound.toSQLString() + "\n", true);
// if (!smorCompound.getComment().equals("")) {
// writeFile(nonCompoundsFile + fileEnding, smorCompound.getComment() + "\n", true);
// }
// if (!gnCompound.getComment().equals("")) {
// writeFile(nonCompoundsFile + fileEnding, gnCompound.getComment() + "\n", true);
// }
// writeFile(nonCompoundsFile + fileEnding, "\n", true);
// }
} else if (gnCompound.getHead().equals("") && asvCompound.getHead().equals("")) {
CompoundDBEntry compound = smorCompound;
compound.setComment("GN and ASV are null; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else if (smorCompound.getHead().equals("") && asvCompound.getHead().equals("")) {
if (gnCompound.isModifierInGermaNet() && gnCompound.getHeadHypernymDistance() < 5
&& gnCompound.getHeadHypernymDistance() > 0) {
CompoundDBEntry compound = gnCompound;
compound.setComment("SMOR and ASV are null; " + gnCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else if (!smorCompound.getComment().equals("") || !asvCompound.getComment().equals("")
// || gnCompound.getComment().equals("GN: head is hypernym")
// || gnCompound.getComment().equals("GN: head is part-whole related")
) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, id + "\t" + noun + ": SMOR and ASV are null\n", true);
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, gnCompound.toSQLString() + "\n", true);
if (!smorCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, smorCompound.getComment() + "\n", true);
}
if (!asvCompound.getComment().equals("")) {
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, asvCompound.getComment() + "\n", true);
}
writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "\n", true);
// writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, noun + "\t" + id + "\n", true);
writeFile(compoundsWithoutHyphensFinalFileName + fileEnding,
noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR and ASV are null; Compound induced, but not splitted\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
} else {
writeFile(nonCompoundsToReviewFile + fileEnding, id + "\t" + noun + ": SMOR and ASV are null\n", true);
writeFile(nonCompoundsToReviewFile + fileEnding, gnCompound.toSQLString() + "\n", true);
if (!smorCompound.getComment().equals("")) {
writeFile(nonCompoundsToReviewFile + fileEnding, smorCompound.getComment() + "\n", true);
}
if (!asvCompound.getComment().equals("")) {
writeFile(nonCompoundsToReviewFile + fileEnding, asvCompound.getComment() + "\n", true);
}
writeFile(nonCompoundsToReviewFile + fileEnding, "\n", true);
writeFile(nonCompoundsFinalFile + fileEnding,
"np\t\t" + noun + "\t" + id + "\t\\N\t\t\t\\N\t\t\tSMOR and ASV are null; No compound induced\t"
+ smorCompoundClone.toSmallSQLString() + "\t" + gnCompoundClone.toSmallSQLString() + "\t"
+ asvCompoundClone.toSmallSQLString() + "\n", true);
}
} else if (asvCompound.getHead().equals("")) {
if (smorCompound.equals(gnCompound)) {
CompoundDBEntry compound = smorCompound;
compound.setComment("SMOR and GN agree, ASV is null; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, gnCompound);
if (compound != null) {
compound.setComment("SMOR and GN do not agree, ASV is null; " + compound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
compound = smorCompound;
compound.setComment("SMOR and GN do not agree, ASV is null; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
}
}
} else if (gnCompound.getHead().equals("")) {
if (smorCompound.equals(asvCompound)) {
CompoundDBEntry compound = smorCompound;
compound.setComment("SMOR and ASV agree, GN is null; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, asvCompound);
if (compound != null) {
compound.setComment("SMOR and ASV do not agree, GN is null; " + compound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
compound = smorCompound;
compound.setComment("SMOR and ASV do not agree, GN is null; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
}
}
} else if (smorCompound.getHead().equals("")) {
if (asvCompound.equals(gnCompound)) {
CompoundDBEntry compound = gnCompound;
compound.setComment("GN and ASV agree, SMOR is null; " + gnCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
CompoundDBEntry compound = determineIfOneCompoundOutperformsTheOther(gnCompound, asvCompound);
if (compound != null) {
compound.setComment("GN and ASV do not agree, SMOR is null; " + compound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
compound = gnCompound;
compound.setComment("GN and ASV do not agree, SMOR is null; " + gnCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
// writeFile(nonCompoundsFile + fileEnding, id + "\t" + noun + ": GN and ASV do not agree, SMOR is null\n", true);
// writeFile(nonCompoundsFile + fileEnding, gnCompound.toSQLString(), true);
// writeFile(nonCompoundsFile + fileEnding, asvCompound.toSQLString() + "\n", true);
}
}
} else if (smorCompound.equals(gnCompound) && smorCompound.equals(asvCompound)) {
CompoundDBEntry compound = smorCompound;
compound.setComment("SMOR, GN, and ASV agree; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else if (smorCompound.equals(gnCompound) && !smorCompound.equals(asvCompound)) {
CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, asvCompound);
if (compound != null) {
// System.out.println("1");
compound.setComment("SMOR and GN agree, ASV not; " + compound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
// System.out.println("2");
compound = smorCompound;
compound.setComment("SMOR and GN agree, ASV not; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
}
} else if (smorCompound.equals(asvCompound) && !smorCompound.equals(gnCompound)) {
CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, gnCompound);
if (compound != null) {
compound.setComment("SMOR and ASV agree, GN not; " + compound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
compound = smorCompound;
compound.setComment("SMOR and ASV agree, GN not; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
}
} else if (gnCompound.equals(asvCompound) && !gnCompound.equals(smorCompound)) {
CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, gnCompound);
if (compound != null) {
compound.setComment("GN and ASV agree, SMOR not; " + compound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
compound = gnCompound;
compound.setComment("GN and ASV agree, SMOR not; " + gnCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
}
} else {
CompoundDBEntry compound = determineIfSmorOutperformsOtherCompound(smorCompound, asvCompound);
if (compound == null) {
compound = determineIfOneCompoundOutperformsTheOther(smorCompound, gnCompound);
if (compound == null) {
compound = determineIfOneCompoundOutperformsTheOther(gnCompound, asvCompound);
}
} else if (determineIfOneCompoundOutperformsTheOther(compound, gnCompound) != null) {
compound = determineIfOneCompoundOutperformsTheOther(compound, gnCompound);
}
if (compound != null) {
compound.setComment("SMOR, GN, and ASV do not agree; " + compound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
} else {
compound = smorCompound;
compound.setComment("SMOR, GN, and ASV do not agree; " + smorCompound.getComment());
writeFinalResultInCorrectFile(compound, smorCompoundClone, gnCompoundClone, asvCompoundClone);
}
}
}
// writeFile(compoundsWithoutHyphensFinalFileName + fileEnding, "\\.\n", true);
// writeFile(compoundsWithoutHyphensToReviewFileName + fileEnding, "\\.\n", true);
// fromLexUnitId = toLexUnitId + 1;
// toLexUnitId += 5000;
// }
}
/**
 * Writes the SQL representation of the selected analysis, followed by a newline, to the
 * final file for compounds without hyphens (file name plus the current file ending).
 * <p>
 * The three per-tool entries are accepted for signature compatibility but are currently
 * not written.
 *
 * @param compound     the analysis that was selected as final result
 * @param smorCompound the SMOR analysis (unused)
 * @param gnCompound   the GermaNet-based analysis (unused)
 * @param asvCompound  the ASV analysis (unused)
 * @throws IOException declared for the underlying {@code writeFile} call
 */
private void writeFinalResultInCorrectFile(CompoundDBEntry compound, CompoundDBEntry smorCompound,
        CompoundDBEntry gnCompound, CompoundDBEntry asvCompound) throws IOException {
    final String targetFile = compoundsWithoutHyphensFinalFileName + fileEnding;
    final String row = compound.toSQLString() + "\n";
    // third argument: presumably "append" (the same flag is false where files are reset) -- confirm in writeFile
    writeFile(targetFile, row, true);
}
/**
 * Delegates to {@code compoundDeterminerGN.processCompoundsWithHyphens()} without
 * restricting the range of lexical unit ids.
 *
 * @throws SQLException propagated from the delegate
 * @throws IOException  propagated from the delegate
 */
private void processCompoundsWithHyphens() throws SQLException, IOException {
compoundDeterminerGN.processCompoundsWithHyphens();
}
/**
 * Delegates to {@code compoundDeterminerGN.processCompoundsWithHyphens(int, int)},
 * forwarding both id bounds unchanged.
 *
 * @param fromLexUnitId lower bound of the lexical unit id range
 * @param toLexUnitId   upper bound of the lexical unit id range
 * @throws SQLException propagated from the delegate
 * @throws IOException  propagated from the delegate
 */
private void processCompoundsWithHyphens(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException {
compoundDeterminerGN.processCompoundsWithHyphens(fromLexUnitId, toLexUnitId);
}
/**
 * Delegates to {@code compoundDeterminerGN.processNonCompoundsWithEmptySpace()} without
 * restricting the range of lexical unit ids.
 *
 * @throws SQLException propagated from the delegate
 * @throws IOException  propagated from the delegate
 */
private void processNonCompoundsWithEmptySpace() throws SQLException, IOException {
compoundDeterminerGN.processNonCompoundsWithEmptySpace();
}
/**
 * Delegates to {@code compoundDeterminerGN.processNonCompoundsWithEmptySpace(int, int)},
 * forwarding both id bounds unchanged.
 *
 * @param fromLexUnitId lower bound of the lexical unit id range
 * @param toLexUnitId   upper bound of the lexical unit id range
 * @throws SQLException propagated from the delegate
 * @throws IOException  propagated from the delegate
 */
private void processNonCompoundsWithEmptySpace(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException {
compoundDeterminerGN.processNonCompoundsWithEmptySpace(fromLexUnitId, toLexUnitId);
}
/**
 * Chooses between the SMOR analysis of a noun and a competing analysis.
 * <p>
 * The text after the last ':' of the SMOR comment is treated as the SMOR analysis string.
 * If both analyses share the head but not the modifier, the first token of that string is
 * inspected; if they share the modifier but not the head, the last token is inspected and a
 * series of tie breakers on the heads is applied. In every case that is not decided here
 * the decision is handed to
 * {@link #determineIfOneCompoundOutperformsTheOther(CompoundDBEntry, CompoundDBEntry)}.
 * <p>
 * Malformed input (empty comment, comment ending in ':', analysis without a space, empty
 * head) no longer raises an index exception; the corresponding heuristic simply does not fire.
 *
 * @param smorCompound  the SMOR analysis
 * @param otherCompound the competing analysis
 * @return the preferred entry (one of the two arguments), or {@code null} if the fallback
 *         comparison cannot decide
 */
private CompoundDBEntry determineIfSmorOutperformsOtherCompound(CompoundDBEntry smorCompound, CompoundDBEntry otherCompound) {
    String fullComment = smorCompound.getComment();
    // Skip the ':' and the character following it; clamp so that an empty comment or a
    // comment ending in ':' yields an empty analysis instead of an exception.
    int analysisStart = Math.min(fullComment.lastIndexOf(":") + 2, fullComment.length());
    String smorComment = fullComment.substring(analysisStart);

    boolean sameHead = smorCompound.getHead().equals(otherCompound.getHead());
    boolean sameModifier = smorCompound.getModifier().equals(otherCompound.getModifier());

    if (sameHead && !sameModifier) {
        // Only the modifiers differ: look at the first space-separated token of the analysis.
        int firstSpace = smorComment.indexOf(' ');
        String modifierAnalysis = firstSpace < 0 ? smorComment : smorComment.substring(0, firstSpace);
        String otherHead = otherCompound.getHead();
        // Upper-case test includes umlauts, consistent with the lower-case test below.
        boolean otherHeadStartsUpperCase = !otherHead.isEmpty()
                && otherHead.substring(0, 1).matches("[A-ZÄÖÜ]");
        // "<V>" / "<ADJ>" are presumably SMOR's verb / adjective tags
        if ((modifierAnalysis.contains("<V>") || modifierAnalysis.contains("<ADJ>"))
                && otherHeadStartsUpperCase) {
            return smorCompound;
        }
    } else if (sameModifier && !sameHead) {
        // Only the heads differ: look at the last token of the analysis (the whole
        // analysis if it contains no space).
        String headAnalysis = smorComment.substring(Math.max(smorComment.lastIndexOf(" "), 0));
        if (headAnalysis.contains("<V>") || headAnalysis.contains("<ADJ>")) {
            return otherCompound;
        }

        boolean otherHeadLowerCase = otherCompound.getHead().matches("[a-zäöü].*");
        boolean smorHeadLowerCase = smorCompound.getHead().matches("[a-zäöü].*");
        // prefer the head that does not start with a lower-case letter
        if (otherHeadLowerCase && !smorHeadLowerCase) {
            return smorCompound;
        }
        if (!otherHeadLowerCase && smorHeadLowerCase) {
            return otherCompound;
        }

        // prefer the head that is known to GermaNet
        if (smorCompound.isHeadInGermaNet() && !otherCompound.isHeadInGermaNet()) {
            return smorCompound;
        }
        if (!smorCompound.isHeadInGermaNet() && otherCompound.isHeadInGermaNet()) {
            return otherCompound;
        }

        // prefer the larger head-hypernym distance
        if (smorCompound.getHeadHypernymDistance() > otherCompound.getHeadHypernymDistance()) {
            return smorCompound;
        }
        if (smorCompound.getHeadHypernymDistance() < otherCompound.getHeadHypernymDistance()) {
            return otherCompound;
        }

        // prefer the longer head
        if (smorCompound.getHead().length() > otherCompound.getHead().length()) {
            return smorCompound;
        }
        if (smorCompound.getHead().length() < otherCompound.getHead().length()) {
            return otherCompound;
        }
    }
    return determineIfOneCompoundOutperformsTheOther(smorCompound, otherCompound);
}
/**
 * Compares two candidate analyses of the same noun and returns the preferred one.
 * <p>
 * Criteria, applied in this order until one decides:
 * <ol>
 * <li>both constituents in GermaNet beats not both in GermaNet;</li>
 * <li>head in GermaNet beats head not in GermaNet;</li>
 * <li>the larger head-hypernym distance wins;</li>
 * <li>a positive modifier-hypernym distance beats a negative one;</li>
 * <li>the smaller modifier-hypernym distance wins;</li>
 * <li>modifier in GermaNet beats modifier not in GermaNet.</li>
 * </ol>
 * Criterion 5 used to be implemented for {@code compound1 > compound2} only (the mirrored
 * branch repeated the same condition and was unreachable), so the result depended on the
 * argument order. It is now symmetric.
 *
 * @param compound1 first candidate
 * @param compound2 second candidate
 * @return {@code compound1} or {@code compound2}, or {@code null} if no criterion decides
 */
private CompoundDBEntry determineIfOneCompoundOutperformsTheOther(CompoundDBEntry compound1, CompoundDBEntry compound2) {
    boolean head1InGN = compound1.isHeadInGermaNet();
    boolean head2InGN = compound2.isHeadInGermaNet();
    boolean both1InGN = head1InGN && compound1.isModifierInGermaNet();
    boolean both2InGN = head2InGN && compound2.isModifierInGermaNet();

    if (both1InGN && !both2InGN) {
        return compound1;
    }
    if (!both1InGN && both2InGN) {
        return compound2;
    }

    if (head1InGN && !head2InGN) {
        return compound1;
    }
    if (!head1InGN && head2InGN) {
        return compound2;
    }

    // Preferring the LARGER head distance is against common sense, but e.g.:
    //   Müll+Verbrennungsanlage   head-hypernym distance 1
    //   Müllverbrennung+Anlage    head-hypernym distance 3 (this is the correct binary split)
    // or:
    //   leicht+Metallgießerei     head-hypernym distance 1
    //   Leichtmetall+Gießerei     head-hypernym distance 2 (this is the correct binary split)
    // This also explains why no check for a distance of -1 is needed here.
    if (compound1.getHeadHypernymDistance() > compound2.getHeadHypernymDistance()) {
        return compound1;
    }
    if (compound1.getHeadHypernymDistance() < compound2.getHeadHypernymDistance()) {
        return compound2;
    }

    if (compound1.getModifierHypernymDistance() > 0 && compound2.getModifierHypernymDistance() < 0) {
        return compound1;
    }
    if (compound1.getModifierHypernymDistance() < 0 && compound2.getModifierHypernymDistance() > 0) {
        return compound2;
    }

    // For modifiers the SMALLER distance wins. What holds for head hypernyms (see above)
    // might also hold for modifier hypernyms; this is not verified but chosen intuitively.
    if (compound1.getModifierHypernymDistance() > compound2.getModifierHypernymDistance()) {
        return compound2;
    }
    if (compound1.getModifierHypernymDistance() < compound2.getModifierHypernymDistance()) {
        // mirrored case of the branch above (previously an unreachable duplicate of it)
        return compound1;
    }

    // Head membership is already known to be equal at this point, so only the modifier
    // can still make a difference.
    if (compound1.isModifierInGermaNet() && !compound2.isModifierInGermaNet()) {
        return compound1;
    }
    if (!compound1.isModifierInGermaNet() && compound2.isModifierInGermaNet()) {
        return compound2;
    }
    return null;
}
}
package decompounder;
import de.uni_leipzig.asv.toolbox.baseforms.Zerleger2;
import decompounder.CompoundDisambiguator.DisambiguatedHypernym;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Proposes a binary split (modifier + head) for a noun using the ASV compound splitter
 * ({@code Zerleger2}) and checks the proposed constituents against GermaNet.
 * <p>
 * Accepted splits are logged to {@code compounds_determined_with_asv}, rejected ones to
 * {@code non_compounds_determined_with_asv}; both names are suffixed with the ending set
 * via {@link #setFileEnding(String)}.
 *
 * @author Verena Henrich
 */
public class CompoundDeterminerASV {

    private static final String COMPOUNDS_FILE = "compounds_determined_with_asv";
    private static final String NON_COMPOUNDS_FILE = "non_compounds_determined_with_asv";

    private final Zerleger2 zerleger;
    private final GermaNet germaNet;
    private String fileEnding = "";

    /**
     * Creates the splitter and initialises it with the tree files under
     * {@code src/main/resources/ASV_trees}.
     *
     * @param germaNet GermaNet instance used to look up constituents
     * @throws IOException declared for the splitter initialisation
     */
    public CompoundDeterminerASV(GermaNet germaNet) throws IOException {
        this.germaNet = germaNet;
        String red = "src/main/resources/ASV_trees/grfExt.tree"; // reduce file for splitting
        String forw = "src/main/resources/ASV_trees/kompVVic.tree"; // forward file
        String back = "src/main/resources/ASV_trees/kompVHic.tree"; // backward file
        zerleger = new Zerleger2(); // splitter (created exactly once)
        zerleger.init(forw, back, red);
    }

    /**
     * Splits {@code noun} with the ASV splitter and turns the result into a binary compound.
     * <ul>
     * <li>fewer than 2 constituents: logged as "no compound", empty entry returned;</li>
     * <li>2 constituents: accepted if at least the head is in GermaNet;</li>
     * <li>3 constituents: both binary bracketings are built and compared;</li>
     * <li>more than 3 constituents: rejected.</li>
     * </ul>
     *
     * @param id   id of the lexical unit of {@code noun}
     * @param noun orthographic form to split
     * @return the selected compound entry, or an entry with empty constituents (and an
     *         explanatory or empty comment) if no split was accepted
     * @throws IOException if writing a log file fails
     */
    public CompoundDBEntry determineCompoundWithASV(int id, String noun) throws IOException {
        List<String> splitted = zerleger.kZerlegung(noun);
        normalizeConstituents(splitted);

        int constituents = splitted.size();
        if (constituents < 2) { // noun is not a compound
            logRejected(noun + ": is no compound");
            return emptyEntry(id, "");
        }
        if (constituents == 2) { // noun is a binary compound
            return determineBinaryCompound(id, noun, splitted);
        }
        if (constituents == 3) {
            return determineTernaryCompound(id, noun, splitted);
        }
        logRejected(noun + " has more than 3 constituents: " + splitted);
        return emptyEntry(id, "ASV: has more than 3 constituents: " + splitted);
    }

    /**
     * Cleans the splitter output in place: drops empty constituents, re-attaches a trailing
     * "schaft" to its predecessor and merges a trailing "bau" + "werk" into "bauwerk".
     */
    private static void normalizeConstituents(List<String> splitted) {
        while (splitted.contains("")) {
            splitted.remove("");
        }
        if (splitted.size() > 1 && splitted.get(splitted.size() - 1).equalsIgnoreCase("schaft")) {
            splitted.remove(splitted.size() - 1);
            int last = splitted.size() - 1;
            splitted.set(last, splitted.get(last) + "schaft");
        }
        if (splitted.size() > 2
                && splitted.get(splitted.size() - 1).equalsIgnoreCase("werk")
                && splitted.get(splitted.size() - 2).equalsIgnoreCase("bau")) {
            splitted.remove(splitted.size() - 1);
            splitted.set(splitted.size() - 1, "bauwerk");
        }
    }

    /** Handles a two-constituent split: accepted if at least the head is in GermaNet. */
    private CompoundDBEntry determineBinaryCompound(int id, String noun, List<String> splitted) throws IOException {
        // compound as proposed by the compound splitter
        String modifier = splitted.get(0);
        String head = splitted.get(1);
        // A constituent containing ';' is replaced by the corresponding part of the noun
        // (presumably ';' separates alternative forms in the splitter output -- confirm).
        if (modifier.contains(";")) {
            modifier = noun.substring(0, noun.length() - head.length());
        }
        if (head.contains(";")) {
            // NOTE(review): skips one character after the modifier; confirm the "+ 1" is intended
            head = noun.substring(modifier.length() + 1);
        }

        CompoundDBEntry compound = createCompoundDBEntry(modifier, head, id);
        if (compound.isHeadInGermaNet()) {
            compound.setComment(compound.isModifierInGermaNet()
                    ? "ASV: 2 constituents, both parts in GN"
                    : "ASV: 2 constituents, head in GN");
            logAccepted(compound);
            return compound;
        }
        logRejected(noun + " is not correctly splitted: " + splitted);
        return emptyEntry(id, "ASV: is not correctly splitted: " + splitted);
    }

    /**
     * Handles a three-constituent split by building both bracketings,
     * {@code A + (B C)} and {@code (A B) + C}, and selecting one by GermaNet membership and
     * hypernym distances.
     */
    private CompoundDBEntry determineTernaryCompound(int id, String noun, List<String> splitted) throws IOException {
        String second = splitted.get(1);

        // Bracketing 1: modifier is the first constituent, head starts with the second one.
        String modifier = splitted.get(0);
        String secondWithoutLastChar = second.substring(0, second.length() - 1);
        String head = noun.substring(modifier.length());
        int headStart = noun.indexOf(secondWithoutLastChar, modifier.length() - 1);
        if (headStart != -1) {
            head = noun.substring(headStart);
        }
        CompoundDBEntry compound1 = createCompoundDBEntry(modifier, head, id);

        // Bracketing 2: head is the third constituent, modifier is everything up to the
        // end of the second one.
        head = splitted.get(2);
        int secondStart = noun.indexOf(second);
        if (secondStart != -1) {
            modifier = noun.substring(0, secondStart + second.length());
        } else {
            modifier = noun.substring(0, noun.length() - head.length());
        }
        CompoundDBEntry compound2 = createCompoundDBEntry(modifier, head, id);

        boolean both1InGN = compound1.isModifierInGermaNet() && compound1.isHeadInGermaNet();
        boolean both2InGN = compound2.isModifierInGermaNet() && compound2.isHeadInGermaNet();

        if (both1InGN && both2InGN) {
            if (compound1.getHeadHypernymDistance() > compound2.getHeadHypernymDistance()) {
                compound1.setComment("ASV: 3 constituents, modifier and head are in GermaNet (headDistance1 > headDistance2)");
                return compound1;
            } else if (compound1.getHeadHypernymDistance() < compound2.getHeadHypernymDistance()) {
                compound2.setComment("ASV: 3 constituents, modifier and head are in GermaNet (headDistance1 < headDistance2)");
                return compound2;
            } else if (compound1.getModifierHypernymDistance() > compound2.getModifierHypernymDistance()) {
                compound1.setComment("ASV: 3 constituents, modifier and head are in GermaNet (modifierDistance1 > modifierDistance2)");
                return compound1;
            } else if (compound1.getModifierHypernymDistance() < compound2.getModifierHypernymDistance()) {
                compound2.setComment("ASV: 3 constituents, modifier and head are in GermaNet (modifierDistance1 < modifierDistance2)");
                return compound2;
            }
            // complete tie: count it, log both candidates, return an empty entry below
            CompoundDeterminer.twoBinaryCompoundsAreInGN++;
            logRejected(noun + ": two possible compounds are in GN");
            logRejected(compound1.toSQLString());
            logRejected(compound2.toSQLString());
        } else if (both1InGN) { // not a duplicate of the branch above: only bracketing 1 is fully in GN
            compound1.setComment("ASV: 3 constituents, both parts in GN");
            logAccepted(compound1);
            return compound1;
        } else if (both2InGN) {
            compound2.setComment("ASV: 3 constituents, both parts in GN");
            logAccepted(compound2);
            return compound2;
        } else if (compound1.isHeadInGermaNet() && compound2.isHeadInGermaNet()) {
            // NOTE(review): unlike the other accepting branches, these two returns neither
            // set a specific comment nor log the accepted compound -- confirm this is intended.
            if (compound1.getHeadHypernymDistance() > compound2.getHeadHypernymDistance()) {
                return compound1;
            } else if (compound1.getHeadHypernymDistance() < compound2.getHeadHypernymDistance()) {
                return compound2;
            }
            logRejected(noun + ": two possible compounds whose heads are in GN");
            logRejected(compound1.toSQLString());
            logRejected(compound2.toSQLString());
        } else if (compound1.isHeadInGermaNet()) {
            compound1.setComment("ASV: 3 constituents, head in GN");
            logAccepted(compound1);
            return compound1;
        } else if (compound2.isHeadInGermaNet()) {
            compound2.setComment("ASV: 3 constituents, head in GN");
            logAccepted(compound2);
            return compound2;
        } else {
            logRejected(noun + " not correctly splitted: " + splitted);
            return emptyEntry(id, "ASV: not correctly splitted: " + splitted);
        }
        return emptyEntry(id, "");
    }

    /**
     * Builds a compound entry for the given split and looks both constituents up in GermaNet.
     * <p>
     * Modifier: tried as given, then with an appended "e", then with a lower-cased first
     * letter; the first variant that is found replaces the modifier. Head: tried with an
     * upper-cased first letter, then as given. A constituent id is set from the
     * disambiguated hypernym if there is one, otherwise from the lexical unit if exactly
     * one was found; ids and distances stay -1 otherwise.
     */
    private CompoundDBEntry createCompoundDBEntry(String modifier, String head, int compoundId) {
        int modifierId = -1;
        int headId = -1;
        String comment = "ASV";
        boolean modifierIsInGermaNet = true;
        boolean headIsInGermaNet = true;
        int modifierHypernymDistance = -1;
        int headHypernymDistance = -1;

        LexUnit compound = germaNet.getLexUnitByID(compoundId);

        DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, modifier, germaNet);
        List<LexUnit> lexUnits = new ArrayList<LexUnit>();
        if (modifierHypernym == null) {
            lexUnits = germaNet.getLexUnits(modifier);
            if (lexUnits.isEmpty()) {
                // try adding modifier-e
                String withE = modifier + "e";
                modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, withE, germaNet);
                if (modifierHypernym == null) {
                    lexUnits = germaNet.getLexUnits(withE);
                }
                if (modifierHypernym != null || lexUnits.size() > 0) {
                    modifier = withE;
                } else {
                    // try first letter of modifier in lower case
                    String lowerCased = decapitalizeFirst(modifier);
                    modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, lowerCased, germaNet);
                    if (modifierHypernym == null) {
                        lexUnits = germaNet.getLexUnits(lowerCased);
                    }
                    if (modifierHypernym != null || lexUnits.size() > 0) {
                        modifier = lowerCased;
                    }
                }
            }
        }
        if (modifierHypernym == null && lexUnits.isEmpty()) {
            modifierIsInGermaNet = false;
        } else if (modifierHypernym != null) {
            modifierId = modifierHypernym.getHypernym().getId();
            modifierHypernymDistance = modifierHypernym.getDistance();
        } else if (lexUnits.size() == 1) {
            modifierId = lexUnits.get(0).getId();
        }

        // try first character of head in upper case
        String capitalizedHead = capitalizeFirst(head);
        DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, capitalizedHead, germaNet);
        lexUnits = new ArrayList<LexUnit>();
        if (headHypernym == null) {
            lexUnits = germaNet.getLexUnits(capitalizedHead);
            if (lexUnits.isEmpty()) {
                // try as it was
                headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, head, germaNet);
                if (headHypernym == null) {
                    lexUnits = germaNet.getLexUnits(head);
                }
            } else {
                head = capitalizedHead;
            }
        } else {
            head = capitalizedHead;
        }
        if (headHypernym == null && lexUnits.isEmpty()) {
            headIsInGermaNet = false;
        }
        if (headHypernym != null) {
            headId = headHypernym.getHypernym().getId();
            headHypernymDistance = headHypernym.getDistance();
        } else if (lexUnits.size() == 1) {
            headId = lexUnits.get(0).getId();
        }

        return new CompoundDBEntry(compound.getOrthForm(), modifier, modifierId,
                head, headId, compoundId, comment, modifierIsInGermaNet, headIsInGermaNet,
                modifierHypernymDistance, headHypernymDistance);
    }

    /** Returns {@code word} with its first character upper-cased; {@code word} must not be empty. */
    private static String capitalizeFirst(String word) {
        return word.substring(0, 1).toUpperCase() + word.substring(1);
    }

    /** Returns {@code word} with its first character lower-cased; {@code word} must not be empty. */
    private static String decapitalizeFirst(String word) {
        return word.substring(0, 1).toLowerCase() + word.substring(1);
    }

    /** Creates the entry that signals "no split": all constituents empty, with the given comment. */
    private static CompoundDBEntry emptyEntry(int id, String comment) {
        return new CompoundDBEntry("", "", "", id, comment);
    }

    /** Adds the SQL line of an accepted compound to the "compounds" log file. */
    private void logAccepted(CompoundDBEntry compound) throws IOException {
        CompoundDeterminer.writeFile(COMPOUNDS_FILE + fileEnding, compound.toSQLString() + "\n", true);
    }

    /** Adds one line to the "non compounds" log file. */
    private void logRejected(String line) throws IOException {
        CompoundDeterminer.writeFile(NON_COMPOUNDS_FILE + fileEnding, line + "\n", true);
    }

    /**
     * Sets the suffix appended to both log file names and writes an empty string to each
     * resulting file with the last {@code writeFile} argument set to {@code false}
     * (presumably truncating/creating the files -- confirm in {@code CompoundDeterminer.writeFile}).
     *
     * @param fileEnding suffix for the log file names
     * @throws IOException if a file cannot be written
     */
    public void setFileEnding(String fileEnding) throws IOException {
        this.fileEnding = fileEnding;
        CompoundDeterminer.writeFile(COMPOUNDS_FILE + fileEnding, "", false);
        CompoundDeterminer.writeFile(NON_COMPOUNDS_FILE + fileEnding, "", false);
    }
}
package decompounder;
import decompounder.CompoundDisambiguator.DisambiguatedHypernym;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
/**
*
* @author Verena Henrich
*/
public class CompoundDeterminerGN {
// GermaNet instance handed in through the constructor.
private GermaNet germaNet;
// JDBC connection on which the SQL queries of this class are prepared.
private final Connection connection;
// Output file names; the "###" placeholder is substituted via String.replace before use.
private String compoundsWithHyphensFile = "compounds_with_hyphens###.txt";
private String nonCompoundsWithEmptySpaceFile = "non_compounds_with_empty_space###.txt";
// Base names of the result files (presumably combined with fileEnding, as in the ASV determiner -- confirm).
private String compoundsDeterminedWithGNFile = "compounds_determined_with_gn";
private String nonCompoundsDeterminedWithGNFile = "non_compounds_determined_with_gn";
// Suffix for the output file names; empty by default.
private String fileEnding = "";
/**
 * Stores the GermaNet instance and the database connection for later use; no other work
 * is done here.
 *
 * @param germaNet   GermaNet instance to work with
 * @param connection database connection used for the SQL queries of this class
 * @throws IOException never thrown by the current body; kept in the signature for callers
 */
public CompoundDeterminerGN(GermaNet germaNet, Connection connection) throws IOException {
    this.connection = connection;
    this.germaNet = germaNet;
}
// Selects id and orth_form of all lexical units whose orth form contains a hyphen with at
// least one character on either side ('_' is the single-character wildcard), contains no
// blank, and whose synset has word_category_id = 1 (presumably the noun category -- confirm).
private static final String GET_NOUNS_WITH_HYPHENS_QUERY = "select comp.id, comp.orth_form "
+ "from lex_unit_table comp, synset_table s "
+ "where comp.orth_form similar to '%_-_%' "
+ "and comp.orth_form not like '% %' "
+ "and s.id = comp.synset_id "
+ "and s.word_category_id = 1";
// Same as GET_NOUNS_WITH_HYPHENS_QUERY, restricted to lexical unit ids between the two
// bound parameters (inclusive, SQL BETWEEN).
private static final String GET_NOUNS_WITH_HYPHENS_FROM_TO_QUERY = "select comp.id, comp.orth_form "
+ "from lex_unit_table comp, synset_table s "
+ "where comp.orth_form similar to '%_-_%' "
+ "and comp.orth_form not like '% %' "
+ "and s.id = comp.synset_id "
+ "and comp.id between ? and ? "
+ "and s.word_category_id = 1";
// Selects id and orth_form of all lexical units with word_category_id = 1 whose orth form
// contains a blank, ordered by id.
private static final String GET_NOUNS_WITH_EMPTY_SPACE_QUERY = "select comp.id, comp.orth_form "
+ "from lex_unit_table comp, synset_table s "
+ "where comp.orth_form similar to '% %' "
// + "and comp.orth_form not like '% %' "
+ "and s.id = comp.synset_id "
+ "and s.word_category_id = 1 "
+ "ORDER BY comp.id";
// Same as GET_NOUNS_WITH_EMPTY_SPACE_QUERY, restricted to lexical unit ids between the two
// bound parameters (inclusive).
private static final String GET_NOUNS_WITH_EMPTY_SPACE_FROM_TO_QUERY = "select comp.id, comp.orth_form "
+ "from lex_unit_table comp, synset_table s "
+ "where comp.orth_form similar to '% %' "
// + "and comp.orth_form not like '% %' "
+ "and s.id = comp.synset_id "
+ "and comp.id between ? and ? "
+ "and s.word_category_id = 1 "
+ "ORDER BY comp.id";
// Selects the distinct orth forms (longer than one character, second character not an
// ASCII upper-case letter) that could be the modifier of the word bound to the two '?'
// parameters: either the word starts with the lower-cased orth form and at least two more
// characters follow, or the orth form ends in 'e' and the word starts with the orth form
// minus that final 'e' (assuming PostgreSQL semantics, where "substring(x from 0 for n)"
// yields the first n-1 characters -- confirm against the target database), again followed
// by at least two characters.
// NOTE(review): the upper-case class here is '[A-Z]' while the head query uses '[A-ZÄÖÜ]'.
private static final String GET_POTENTIAL_MODIFIERS_QUERY = "select distinct l_modifier.orth_form "
+ "from lex_unit_table l_modifier "
+ "where length(l_modifier.orth_form) > 1 "
+ "and (? like lower(l_modifier.orth_form) || '%__' "
+ "or (? like substring(l_modifier.orth_form from 0 for length(l_modifier.orth_form)) || '__%' "
+ "and substring(l_modifier.orth_form from 0 for length(l_modifier.orth_form)) like 'e')) "
+ "and substring(l_modifier.orth_form from 2) not similar to '[A-Z]%'";
// Selects the distinct orth forms (longer than two characters, first character upper case
// including umlauts, second character not upper case) that could be the head of the word
// bound to the '?' parameter: the word must end with the lower-cased orth form and have at
// least two characters in front of it.
private static final String GET_POTENTIAL_HEADS_QUERY = "select distinct l_head.orth_form "
+ "from lex_unit_table l_head "
+ "where length(l_head.orth_form) > 2 "
// + "and l_head.orth_form not like 'in' "
// + "and l_head.orth_form not like 'Ei' "
// + "and l_head.orth_form not like 'Rei' "
// + "and l_head.orth_form not like 'Re' "
+ "and ? like '__%' || lower(l_head.orth_form) "
+ "and substring(l_head.orth_form from 1 for 1) similar to '[A-ZÄÖÜ]' "
+ "and substring(l_head.orth_form from 2) not similar to '[A-ZÄÖÜ]%'";
/**
 * Writes every noun whose orth form contains a space to the non-compound output file
 * (placeholder "###" in the file name replaced by the empty string).
 *
 * @throws SQLException if preparing, executing or reading the query fails
 * @throws IOException  if writing the output file fails
 */
public void processNonCompoundsWithEmptySpace() throws SQLException, IOException {
    String outputFile = nonCompoundsWithEmptySpaceFile.replace("###", "");
    // Written with an empty string and the flag set to false before any row is appended
    // (presumably this starts a fresh file - confirm against CompoundDeterminer.writeFile).
    CompoundDeterminer.writeFile(outputFile, "", false);
    PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITH_EMPTY_SPACE_QUERY);
    try {
        appendNonCompoundsWithEmptySpace(findNounsStatement, outputFile);
    } finally {
        // Release the statement even when reading or writing fails.
        findNounsStatement.close();
    }
}

/**
 * Same as {@link #processNonCompoundsWithEmptySpace()}, restricted to lexical units whose id
 * lies between the two given ids; the range is also encoded in the output file name
 * ("_from_<from>_to_<to>").
 *
 * @param fromLexUnitId lower bound passed to the query's "between"
 * @param toLexUnitId   upper bound passed to the query's "between"
 * @throws SQLException if preparing, executing or reading the query fails
 * @throws IOException  if writing the output file fails
 */
public void processNonCompoundsWithEmptySpace(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException {
    String outputFile = nonCompoundsWithEmptySpaceFile.replace("###", "_from_" + fromLexUnitId + "_to_" + toLexUnitId);
    CompoundDeterminer.writeFile(outputFile, "", false);
    PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITH_EMPTY_SPACE_FROM_TO_QUERY);
    try {
        findNounsStatement.setInt(1, fromLexUnitId);
        findNounsStatement.setInt(2, toLexUnitId);
        appendNonCompoundsWithEmptySpace(findNounsStatement, outputFile);
    } finally {
        findNounsStatement.close();
    }
}

// Executes the prepared noun query and appends one tab-separated "np" row per result to outputFile.
private void appendNonCompoundsWithEmptySpace(PreparedStatement findNounsStatement, String outputFile)
        throws SQLException, IOException {
    ResultSet results = findNounsStatement.executeQuery();
    try {
        while (results.next()) {
            int id = results.getInt("id");
            String orthForm = results.getString("orth_form");
            CompoundDeterminer.writeFile(outputFile,
                    "np\t\t" + orthForm + "\t" + id + "\t\\N\t\t\t\\N\t\t\tGN: orth form with empty space (i.e., no compound)"
                    + "\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\n", true);
        }
    } finally {
        // Close the cursor even if a row could not be read or written.
        results.close();
    }
}
/**
 * Writes every hyphenated noun whose lexical-unit id lies between the two given ids to the
 * hyphen-compound output file; the range is encoded in the file name ("_from_<from>_to_<to>").
 *
 * @param fromLexUnitId lower bound passed to the query's "between"
 * @param toLexUnitId   upper bound passed to the query's "between"
 * @throws SQLException if preparing, executing or reading the query fails
 * @throws IOException  if writing the output file fails
 */
public void processCompoundsWithHyphens(int fromLexUnitId, int toLexUnitId) throws SQLException, IOException {
    String outputFile = compoundsWithHyphensFile.replace("###", "_from_" + fromLexUnitId + "_to_" + toLexUnitId);
    // Written with an empty string and the flag set to false before any row is appended
    // (presumably this starts a fresh file - confirm against CompoundDeterminer.writeFile).
    CompoundDeterminer.writeFile(outputFile, "", false);
    PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITH_HYPHENS_FROM_TO_QUERY);
    try {
        findNounsStatement.setInt(1, fromLexUnitId);
        findNounsStatement.setInt(2, toLexUnitId);
        appendCompoundsWithHyphens(findNounsStatement, outputFile);
    } finally {
        // Release the statement even when reading or writing fails.
        findNounsStatement.close();
    }
}

/**
 * Writes every hyphenated noun to the hyphen-compound output file (placeholder "###" in the
 * file name replaced by the empty string).
 *
 * @throws SQLException if preparing, executing or reading the query fails
 * @throws IOException  if writing the output file fails
 */
public void processCompoundsWithHyphens() throws SQLException, IOException {
    String outputFile = compoundsWithHyphensFile.replace("###", "");
    CompoundDeterminer.writeFile(outputFile, "", false);
    PreparedStatement findNounsStatement = connection.prepareStatement(GET_NOUNS_WITH_HYPHENS_QUERY);
    try {
        appendCompoundsWithHyphens(findNounsStatement, outputFile);
    } finally {
        findNounsStatement.close();
    }
}

// Executes the prepared hyphen query, splits each orth form at its last hyphen into modifier
// and head, looks both parts up in GermaNet and appends one CompoundDBEntry row per result.
private void appendCompoundsWithHyphens(PreparedStatement findNounsStatement, String outputFile)
        throws SQLException, IOException {
    ResultSet results = findNounsStatement.executeQuery();
    try {
        while (results.next()) {
            int compoundId = results.getInt("id");
            LexUnit compound = germaNet.getLexUnitByID(compoundId);
            String compoundOrthForm = results.getString("orth_form");

            // Everything before the last hyphen is the modifier, everything after it the head.
            int lastHyphen = compoundOrthForm.lastIndexOf("-");
            String modifier = compoundOrthForm.substring(0, lastHyphen);
            String head = compoundOrthForm.substring(lastHyphen + 1);

            int modifierId = -1;
            int headId = -1;
            boolean modifierIsInGN = true;
            boolean headIsInGN = true;

            // Modifier lookup: single-character modifiers are never looked up.
            List<LexUnit> lexUnits;
            if (modifier.length() > 1) {
                lexUnits = germaNet.getLexUnits(modifier, false);
                if (lexUnits.size() < 1) {
                    // Retry with the first letter in lower case; only a unique hit is accepted.
                    String changedModifier = modifier.substring(0, 1).toLowerCase() + modifier.substring(1);
                    lexUnits = germaNet.getLexUnits(changedModifier, false);
                    if (lexUnits.size() == 1) {
                        modifier = changedModifier;
                        modifierId = lexUnits.get(0).getId();
                    } else {
                        modifierIsInGN = false;
                    }
                } else if (lexUnits.size() == 1) {
                    modifierId = lexUnits.get(0).getId();
                }
                // Several hits for the original spelling: the id stays -1 and the flag stays true.
            } else {
                modifierIsInGN = false;
            }

            // Head lookup, same strategy but without a length guard.
            // NOTE(review): an orth form ending in "-" gives an empty head; if that lookup returns
            // no units, substring(0, 1) below throws - confirm such forms cannot occur.
            lexUnits = germaNet.getLexUnits(head, false);
            if (lexUnits.size() < 1) {
                String changedHead = head.substring(0, 1).toLowerCase() + head.substring(1);
                lexUnits = germaNet.getLexUnits(changedHead, false);
                if (lexUnits.size() == 1) {
                    head = changedHead;
                    headId = lexUnits.get(0).getId();
                } else {
                    headIsInGN = false;
                }
            } else if (lexUnits.size() == 1) {
                headId = lexUnits.get(0).getId();
            }

            // Distance stays -1 when the disambiguator returns null for that part.
            int modifierHypernymDistance = -1;
            DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, modifier, germaNet);
            if (modifierHypernym != null) {
                modifierHypernymDistance = modifierHypernym.getDistance();
            }
            int headHypernymDistance = -1;
            DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, head, germaNet);
            if (headHypernym != null) {
                headHypernymDistance = headHypernym.getDistance();
            }

            // The comment text (including its spelling) is emitted data and is kept unchanged.
            CompoundDBEntry compoundWithHyphen = new CompoundDBEntry(compoundOrthForm, modifier, modifierId,
                    head, headId, compoundId, "GN: compound with hypen(s)",
                    modifierIsInGN, headIsInGN, modifierHypernymDistance, headHypernymDistance);
            CompoundDeterminer.writeFile(outputFile, compoundWithHyphen.toSQLString() + "\n", true);
        }
    } finally {
        // Close the cursor even if a lookup or write failed part-way through.
        results.close();
    }
}
public CompoundDBEntry determineCompoundWithGermaNet(int id, String noun) throws IOException, SQLException {
if (noun.endsWith("keit")) {
// System.out.println(noun + ": ends with keit - " + noun.substring(0, noun.lastIndexOf("keit")));
if (noun.endsWith("igkeit")
&& !germaNet.getLexUnits(noun.substring(0, noun.lastIndexOf("igkeit"))).isEmpty()) {
// System.out.println(noun + ": existing word + igkeit");
return new CompoundDBEntry("", "", "", id, "GN: existing word + \'igkeit\'");
} else if (!germaNet.getLexUnits(noun.substring(0, noun.lastIndexOf("keit"))).isEmpty()) {
// System.out.println(noun + ": existing word + keit");
return new CompoundDBEntry("", "", "", id, "GN: existing word + \'keit\'");
} else if (!noun.endsWith("fertigkeit")
&& !noun.endsWith("fähigkeit")
&& !noun.endsWith("möglichkeit")) {
// System.out.println(noun + ": keit");
return new CompoundDBEntry("", "", "", id, "GN: ends with \'keit\'");
}
}
if (noun.endsWith("heit")) {
// System.out.println(noun + ": ends with heit - " + noun.substring(0, noun.lastIndexOf("heit")));
if (!germaNet.getLexUnits(noun.substring(0, noun.lastIndexOf("heit"))).isEmpty()) {
// System.out.println(noun + ": existing word + heit");
return new CompoundDBEntry("", "", "", id, "GN: existing word + \'heit\'");
}
}
List<String> modifiers = new ArrayList<String>();
PreparedStatement findStatement = connection.prepareStatement(GET_POTENTIAL_MODIFIERS_QUERY);
findStatement.setString(1, noun.toLowerCase());
findStatement.setString(2, noun.toLowerCase());
ResultSet results = findStatement.executeQuery();
while (results.next()) {
modifiers.add(results.getString("orth_form"));
}
results.close();
// System.out.print(noun + ":");
List<String> heads = new ArrayList<String>();
findStatement = connection.prepareStatement(GET_POTENTIAL_HEADS_QUERY);
findStatement.setString(1, noun.toLowerCase());
results = findStatement.executeQuery();
while (results.next()) {
heads.add(results.getString("orth_form"));
// System.out.print(" " + results.getString("orth_form"));
}
results.close();
// remove head "Werk" in case the noun ends with "Bauwerk"
if (heads.contains("Bauwerk") && heads.contains("Werk")) {
heads.remove("Werk");
}
// System.out.println("\n" + noun);
List<String[]> potentialCompounds_completeMatch = new ArrayList<String[]>();
List<String[]> potentialCompounds_completeMatchSmallModifier = new ArrayList<String[]>();
List<String[]> potentialCompounds_withModifierE = new ArrayList<String[]>();
List<String[]> potentialCompounds_withInterfix = new ArrayList<String[]>();
// System.out.println("modifiers=" + modifiers);
// System.out.println("heads=" + heads);
LexUnit compound = germaNet.getLexUnitByID(id);
DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, heads, germaNet);
LexUnit headSynonym = CompoundDisambiguator.disambiguateSynonym(compound, heads, germaNet);
if (headSynonym != null) {
// System.out.println(noun + " " + id + ": headSynonym=" + headSynonym.getOrthForm());
}
LexUnit headRelated = CompoundDisambiguator.disambiguateRelation(compound, heads, germaNet);
if (headRelated != null) {
// System.out.println(noun + " " + id + ": headRelated=" + headRelated.getOrthForm());
}
LexUnit headPwRelated = CompoundDisambiguator.disambiguatePWRelation(compound, heads, germaNet);
if (headPwRelated != null) {
// System.out.println(noun + " " + id + ": headPwRelated=" + headPwRelated.getOrthForm());
} else if (headRelated != null) {
headPwRelated = headRelated;
}
List<String> headsToRemove = new ArrayList<String>();
headsToRemove.add("Schaft");
headsToRemove.add("Esse");
headsToRemove.add("Ion");
headsToRemove.add("Sal");
headsToRemove.add("Eid");
headsToRemove.add("Bel");
headsToRemove.add("Ade");
headsToRemove.add("Ale");
headsToRemove.add("Fon");
headsToRemove.add("Ren");
headsToRemove.add("Elle");
headsToRemove.add("Max");
for (String headToRemove : headsToRemove) {
if (heads.contains(headToRemove)
&& !((headHypernym != null && !headHypernym.getHypernym().getOrthForm().equals(headToRemove))
|| (headSynonym != null && !headSynonym.getOrthForm().equals(headToRemove))
|| (headRelated != null && !headRelated.getOrthForm().equals(headToRemove))
|| (headPwRelated != null && !headPwRelated.getOrthForm().equals(headToRemove)))) {
// System.out.println(noun + " " + id + ": remove head \"" + headToRemove + "\"");
heads.remove(headToRemove);
}
}
DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, modifiers, germaNet);
LexUnit modifierSynonym = CompoundDisambiguator.disambiguateSynonym(compound, modifiers, germaNet);
if (modifierSynonym != null) {
// System.out.println(noun + " " + id + ": modifierSynonym=" + modifierSynonym.getOrthForm());
}
LexUnit modifierRelated = CompoundDisambiguator.disambiguateRelation(compound, modifiers, germaNet);
if (modifierRelated != null) {
// System.out.println(noun + " " + id + ": modifierRelated=" + modifierRelated.getOrthForm());
}
LexUnit modifierPwRelated = CompoundDisambiguator.disambiguatePWRelation(compound, modifiers, germaNet);
if (modifierPwRelated != null) {
// System.out.println(noun + " " + id + ": modifierPwRelated=" + modifierPwRelated.getOrthForm());
} else if (modifierRelated != null) {
modifierPwRelated = modifierRelated;
}
List<String> modifiersToRemove = new ArrayList<String>();
// modifiersToRemove.add("Ei");
modifiersToRemove.add("Inn");
modifiersToRemove.add("Rei");
modifiersToRemove.add("Aus");
modifiersToRemove.add("Au");
modifiersToRemove.add("Ge");
modifiersToRemove.add("in");
modifiersToRemove.add("Ga");
modifiersToRemove.add("Re");
for (String modifierToRemove : modifiersToRemove) {
if (modifiers.contains(modifierToRemove)
&& !((modifierHypernym != null && modifierHypernym.getHypernym().getOrthForm().equals(modifierToRemove))
|| (modifierSynonym != null && modifierSynonym.getOrthForm().equals(modifierToRemove))
|| (modifierRelated != null && modifierRelated.getOrthForm().equals(modifierToRemove))
|| (modifierPwRelated != null && modifierPwRelated.getOrthForm().equals(modifierToRemove)))) {
// System.out.println(noun + " " + id + ": remove modifier \"" + modifierToRemove + "\"");
modifiers.remove(modifierToRemove);
}
}
if (modifiers.contains("Weh")
&& !((modifierHypernym != null && !modifierHypernym.getHypernym().getOrthForm().equals("Weh"))
|| (modifierSynonym != null && !modifierSynonym.getOrthForm().equals("Weh"))
|| (modifierRelated != null && !modifierRelated.getOrthForm().equals("Weh"))
|| (modifierPwRelated != null && !modifierPwRelated.getOrthForm().equals("Weh")))
&& (modifiers.contains("Wehr"))) {
// System.out.println(noun + " " + id + ": remove modifier \"Weh\"");
modifiers.remove("Weh");
}
if (modifiers.contains("Tage")
&& !((modifierHypernym != null && !modifierHypernym.getHypernym().getOrthForm().equals("Tage"))
|| (modifierSynonym != null && !modifierSynonym.getOrthForm().equals("Tage"))
|| (modifierRelated != null && !modifierRelated.getOrthForm().equals("Tage"))
|| (modifierPwRelated != null && !modifierPwRelated.getOrthForm().equals("Tage")))
&& (modifiers.contains("Tag"))) {
// System.out.println(noun + " " + id + ": remove modifier \"Tage\"");
modifiers.remove("Tage");
}
if (modifiers.contains("Ei")
&& !((modifierHypernym != null && !modifierHypernym.getHypernym().getOrthForm().equals("Ei"))
|| (modifierSynonym != null && !modifierSynonym.getOrthForm().equals("Ei"))
|| (modifierRelated != null && !modifierRelated.getOrthForm().equals("Ei"))
|| (modifierPwRelated != null && !modifierPwRelated.getOrthForm().equals("Ei")))
&& !(compound.getOrthForm().startsWith("Eier"))) {
// System.out.println(noun + " " + id + ": remove modifier \"Ei\"");
modifiers.remove("Ei");
}
// System.out.println("fuge=" + compound.getOrthForm().length() + " " + modifierHypernym.getHypernym().getOrthForm().length() + " " + headHypernym.getHypernym().getOrthForm().length());
// System.out.println("fuge=" + compound.getOrthForm().substring(modifierHypernym.getHypernym().getOrthForm().length()));
// if (modifierHypernym.getHypernym().getOrthForm().length() + headHypernym.getHypernym().getOrthForm().length() <= compound.getOrthForm().length()) {
// System.out.println("fuge=" + compound.getOrthForm().substring(modifierHypernym.getHypernym().getOrthForm().length(), compound.getOrthForm().length() - headHypernym.getHypernym().getOrthForm().length()));
// }
if (headPwRelated != null && modifierPwRelated != null
&& modifierPwRelated.getOrthForm().length() + headPwRelated.getOrthForm().length() <= compound.getOrthForm().length()
&& compound.getOrthForm().substring(modifierPwRelated.getOrthForm().length(), compound.getOrthForm().length() - headPwRelated.getOrthForm().length()).matches("(e|n|s|er|en|es|ens|)")) {
// System.out.println("fuge=" + compound.getOrthForm().substring(headPwRelated.getOrthForm().length(), compound.getOrthForm().length() - modifierPwRelated.getOrthForm().length()));
String head = headPwRelated.getOrthForm();
String modifier = modifierPwRelated.getOrthForm();
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: head and modifier are part-whole related", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: head and modifier are part-whole related", germaNet);
} else if (headHypernym != null && modifierHypernym != null
&& modifierHypernym.getHypernym().getOrthForm().length() + headHypernym.getHypernym().getOrthForm().length() <= compound.getOrthForm().length()
&& compound.getOrthForm().substring(modifierHypernym.getHypernym().getOrthForm().length(), compound.getOrthForm().length() - headHypernym.getHypernym().getOrthForm().length()).matches("(e|n|s|er|en|es|ens|)")) {
// System.out.println(noun + ": headHypernym=" + headHypernym.getOrthForm()
// + ", modifierHypernym=" + modifierHypernym.getOrthForm());
String head = headHypernym.getHypernym().getOrthForm();
String modifier = modifierHypernym.getHypernym().getOrthForm();
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: head and modifier are hypernyms", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: head and modifier are hypernyms", germaNet);
} else if(!heads.isEmpty() && !modifiers.isEmpty()) {
for (String head : heads) {
int indexOfHead = noun.lastIndexOf(head.toLowerCase());
if (indexOfHead == -1) {
indexOfHead = noun.lastIndexOf(head);
if (indexOfHead == -1) {
indexOfHead = noun.toLowerCase().lastIndexOf(head.toLowerCase());
}
}
// System.out.println("indexOfHead=" + indexOfHead + ", noun.substring(0, indexOfHead)=" + noun.substring(0, indexOfHead));
if (modifiers.contains(noun.substring(0, indexOfHead))) {
String[] potentialCompound = {noun.substring(0, indexOfHead), head};
potentialCompounds_completeMatch.add(potentialCompound);
// System.out.println("potentialCompounds_exactMatches: " + noun.substring(0, indexOfHead) + " " + head);
} else if (modifiers.contains(noun.substring(0, indexOfHead) + "e")) {
String[] potentialCompound = {noun.substring(0, indexOfHead) + "e", head};
potentialCompounds_withModifierE.add(potentialCompound);
// System.out.println("potentialCompounds_withModifierE: " + noun.substring(0, indexOfHead) + "e " + head);
} else if (modifiers.contains(noun.substring(0, indexOfHead - 1))
&& (noun.substring(indexOfHead - 1, indexOfHead).equals("e")
|| noun.substring(indexOfHead - 1, indexOfHead).equals("n")
|| noun.substring(indexOfHead - 1, indexOfHead).equals("s"))) {
String[] potentialCompound = {noun.substring(0, indexOfHead - 1), head};
potentialCompounds_withInterfix.add(potentialCompound);
// System.out.println("potentialCompounds_withInterfix: " + noun.substring(0, indexOfHead - 1) + " " + head);
} else if (modifiers.contains(noun.substring(0, indexOfHead - 2))
&& (noun.substring(indexOfHead - 2, indexOfHead).equals("en")
|| noun.substring(indexOfHead - 2, indexOfHead).equals("er")
|| noun.substring(indexOfHead - 2, indexOfHead).equals("es"))) {
String[] potentialCompound = {noun.substring(0, indexOfHead - 2), head};
potentialCompounds_withInterfix.add(potentialCompound);
// System.out.println("potentialCompounds_withInterfix: " + noun.substring(0, indexOfHead - 2) + " " + head);
} else if (indexOfHead > 2
&& modifiers.contains(noun.substring(0, indexOfHead - 3))
&& noun.substring(indexOfHead - 3, indexOfHead).equals("ens")) {
String[] potentialCompound = {noun.substring(0, indexOfHead - 3), head};
potentialCompounds_withInterfix.add(potentialCompound);
// System.out.println("potentialCompounds_withInterfix: " + noun.substring(0, indexOfHead - 3) + " " + head);
} else if (modifiers.contains(noun.substring(0, 1).toLowerCase() + noun.substring(1, indexOfHead))) {
String[] potentialCompound = {noun.substring(0, 1).toLowerCase() + noun.substring(1, indexOfHead), head};
potentialCompounds_completeMatchSmallModifier.add(potentialCompound);
// System.out.println("potentialCompounds_exactMatches: " + noun.substring(0, 1).toLowerCase() + noun.substring(0, indexOfHead) + " " + head);
}
}
if (potentialCompounds_completeMatch.size() == 1) {
String modifier = potentialCompounds_completeMatch.get(0)[0];
String head = potentialCompounds_completeMatch.get(0)[1];
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: 1 complete match", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: 1 complete match", germaNet);
} else if (potentialCompounds_completeMatch.size() > 1) {
if (headHypernym != null) {
String comment = "GN: more complete matches, head is hypernym";
String head = headHypernym.getHypernym().getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatch) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
if (modifierPwRelated != null) {
String comment = "GN: more complete matches, modifier is part-whole related";
String modifier = modifierPwRelated.getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatch) {
if (modifier_head[0].equals(modifier)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, modifier_head[1], id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet);
}
}
}
if (headPwRelated != null) {
String comment = "GN: more complete matches, head is part-whole related";
String head = headPwRelated.getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatch) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
if (modifierSynonym != null) {
String comment = "GN: more complete matches, modifier has synonym";
String modifier = modifierSynonym.getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatch) {
if (modifier_head[0].equals(modifier)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, modifier_head[1], id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet);
}
}
}
if (headSynonym != null) {
String comment = "GN: more complete matches, head has synonym";
String head = headSynonym.getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatch) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
String comment = "GN: more complete matches: ";
CompoundDeterminer.writeFile(nonCompoundsDeterminedWithGNFile + fileEnding,
noun + ": modifiers=" + modifiers + ", heads=" + heads + "\n", true);
for (String[] modifier_head : potentialCompounds_completeMatch) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "\n" + modifier_head[0] + "-c-" + modifier_head[1] + "\n\n", true);
comment += modifier_head[0] + " " + modifier_head[1] + "; ";
}
return new CompoundDBEntry("", "", "", id, comment);
} else if ((potentialCompounds_withInterfix.size() + potentialCompounds_withModifierE.size()) == 1) {
if (potentialCompounds_withInterfix.size() == 1) {
String modifier = potentialCompounds_withInterfix.get(0)[0];
String head = potentialCompounds_withInterfix.get(0)[1];
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: interfix", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: interfix", germaNet);
} else {
String modifier = potentialCompounds_withModifierE.get(0)[0];
String head = potentialCompounds_withModifierE.get(0)[1];
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: modifier-e", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: modifier-e", germaNet);
}
} else if ((potentialCompounds_withInterfix.size() + potentialCompounds_withModifierE.size()) > 1) {
if (headHypernym != null) {
String comment = "GN: more matches with interfix or modifier-e, head is hypernym";
String head = headHypernym.getHypernym().getOrthForm();
for (String[] modifier_head : potentialCompounds_withInterfix) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
for (String[] modifier_head : potentialCompounds_withModifierE) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
if (modifierPwRelated != null) {
String comment = "GN: more matches with interfix or modifier-e, modifier is part-whole related";
String modifier = modifierPwRelated.getOrthForm();
for (String[] modifier_head : potentialCompounds_withInterfix) {
if (modifier_head[0].equals(modifier)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, modifier_head[1], id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet);
}
}
for (String[] modifier_head : potentialCompounds_withModifierE) {
if (modifier_head[0].equals(modifier)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, modifier_head[1], id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet);
}
}
}
if (headPwRelated != null) {
String comment = "GN: more matches with interfix or modifier-e, head is part-whole related";
String head = headPwRelated.getOrthForm();
for (String[] modifier_head : potentialCompounds_withInterfix) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
for (String[] modifier_head : potentialCompounds_withModifierE) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
if (modifierSynonym != null) {
String comment = "GN: more matches with interfix or modifier-e, modifier has synonym";
String modifier = modifierSynonym.getOrthForm();
for (String[] modifier_head : potentialCompounds_withInterfix) {
if (modifier_head[0].equals(modifier)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, modifier_head[1], id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet);
}
}
for (String[] modifier_head : potentialCompounds_withModifierE) {
if (modifier_head[0].equals(modifier)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, modifier_head[1], id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet);
}
}
}
if (headSynonym != null) {
String comment = "GN: more matches with interfix or modifier-e, head has synonym";
String head = headSynonym.getOrthForm();
for (String[] modifier_head : potentialCompounds_withInterfix) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
for (String[] modifier_head : potentialCompounds_withModifierE) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
String comment = "GN: more matches with interfix or modifier-e: ";
for (String[] modifier_head : potentialCompounds_withInterfix) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "\n" + modifier_head[0] + "-i-" + modifier_head[1] + "\n\n", true);
comment += modifier_head[0] + " " + modifier_head[1] + "; ";
}
for (String[] modifier_head : potentialCompounds_withModifierE) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "\n" + modifier_head[0] + "-e-" + modifier_head[1] + "\n\n", true);
comment += modifier_head[0] + " " + modifier_head[1] + "; ";
}
return new CompoundDBEntry("", "", "", id, comment);
} else if (potentialCompounds_completeMatchSmallModifier.size() > 0) {
if (potentialCompounds_completeMatchSmallModifier.size() == 1) {
String modifier = potentialCompounds_completeMatchSmallModifier.get(0)[0];
String head = potentialCompounds_completeMatchSmallModifier.get(0)[1];
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: 1 complete match with modifier small case", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: complete match with modifier small case", germaNet);
}
if (headHypernym != null) {
String comment = "GN: more complete matches with modifier small case, head is hypernym";
String head = headHypernym.getHypernym().getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
if (modifierPwRelated != null) {
String comment = "GN: more complete matches with modifier small case, modifier is part-whole related";
String modifier = modifierPwRelated.getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) {
if (modifier_head[0].equals(modifier)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, modifier_head[1], id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet);
}
}
}
if (headPwRelated != null) {
String comment = "GN: more complete matches with modifier small case, head is part-whole related";
String head = headPwRelated.getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
if (modifierSynonym != null) {
String comment = "GN: more complete matches with modifier small case, modifier has synonym";
String modifier = modifierSynonym.getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) {
if (modifier_head[0].equals(modifier)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, modifier_head[1], id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, modifier_head[1], id, comment, germaNet);
}
}
}
if (headSynonym != null) {
String comment = "GN: more complete matches with modifier small case, head has synonym";
String head = headSynonym.getOrthForm();
for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) {
if (modifier_head[1].equals(head)) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier_head[0], head, id,
comment, germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier_head[0], head, id, comment, germaNet);
}
}
}
String comment = "GN: more complete matches with modifier small case: ";
for (String[] modifier_head : potentialCompounds_completeMatchSmallModifier) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding, "\n" + modifier_head[0] + "-s-" + modifier_head[1] + "\n\n", true);
comment += modifier_head[0] + " " + modifier_head[1] + "; ";
}
return new CompoundDBEntry("", "", "", id, comment);
// } else if (modifierSynonym != null) {
// String modifier = modifierSynonym.getOrthForm();
// String head = noun.substring(modifier.length(), noun.length());
//
// for (int cut = 0; cut < 3; cut++) {
// // head with first letter upper case
// head = head.substring(cut, cut+1).toUpperCase() + head.substring(cut+1);
// if (!germaNet.getLexUnits(head).isEmpty()) {
// break;
// }
//
// // head all small
// head = head.substring(cut);
// if (!germaNet.getLexUnits(head).isEmpty()) {
// break;
// }
//
// // in case no match yet, use original split with first letter upper case
// head = noun.substring(modifier.length(), noun.length());
// head = head.substring(0, 1).toUpperCase() + head.substring(1);
// }
// System.out.println("head=" + head);
//
// if (!head.equals("Schaft")) {
// CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
// (new CompoundDBEntry(modifier, head, id,
// "GN: modifier has synonym", germaNet)).toSQLString() + "\n", true);
// return new CompoundDBEntry(modifier, head, id,
// "GN: modifier has synonym", germaNet);
// }
} else if (modifierPwRelated != null) {
String modifier = modifierPwRelated.getOrthForm();
String head = noun.substring(modifier.length(), noun.length());
for (int cut = 0; cut < 3; cut++) {
// head with first letter upper case
head = head.substring(cut, cut+1).toUpperCase() + head.substring(cut+1);
if (!germaNet.getLexUnits(head).isEmpty()) {
break;
}
// head all small
head = head.substring(cut);
if (!germaNet.getLexUnits(head).isEmpty()) {
break;
}
// in case no match yet, use original split with first letter upper case
head = noun.substring(modifier.length(), noun.length());
head = head.substring(0, 1).toUpperCase() + head.substring(1);
}
// System.out.println("head=" + head);
if (!head.equals("Schaft")) {
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: modifier is part-whole related", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: modifier is part-whole related", germaNet);
}
} else if (headSynonym != null) {
String head = headSynonym.getOrthForm();
int indexOfHead = noun.lastIndexOf(head.toLowerCase());
if (indexOfHead == -1) {
indexOfHead = noun.lastIndexOf(head);
}
String modifier = noun.substring(0, indexOfHead);
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: head has synonym", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: head has synonym", germaNet);
} else if (headPwRelated != null) {
String head = headPwRelated.getOrthForm();
int indexOfHead = noun.lastIndexOf(head.toLowerCase());
if (indexOfHead == -1) {
indexOfHead = noun.lastIndexOf(head);
}
String modifier = noun.substring(0, indexOfHead);
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: head is part-whole related", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: head is part-whole related", germaNet);
} else if (headHypernym != null) {
String head = headHypernym.getHypernym().getOrthForm();
int indexOfHead = noun.lastIndexOf(head.toLowerCase());
if (indexOfHead == -1) {
indexOfHead = noun.lastIndexOf(head);
}
String modifier = noun.substring(0, indexOfHead);
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: head is hypernym", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: head is hypernym", germaNet);
} else {
CompoundDeterminer.writeFile(nonCompoundsDeterminedWithGNFile + fileEnding,
noun + ": modifiers=" + modifiers + ", heads=" + heads + " (no match)\n", true);
}
} else if (headPwRelated != null) {
String head = headPwRelated.getOrthForm();
int indexOfHead = noun.lastIndexOf(head.toLowerCase());
if (indexOfHead == -1) {
indexOfHead = noun.lastIndexOf(head);
}
String modifier = noun.substring(0, indexOfHead);
for (int cut = 1; cut < 4; cut++) {
if (modifier.length() > cut && modifiers.contains(modifier.substring(0, modifier.length() - cut))) {
modifier = modifier.substring(0, modifier.length() - cut);
break;
}
}
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: head is part-whole related", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: head is part-whole related", germaNet);
} else if (modifierPwRelated != null) {
String modifier = modifierPwRelated.getOrthForm();
String head = noun.substring(modifier.length(), noun.length());
for (int cut = 0; cut < 3; cut++) {
// head with first letter upper case
head = head.substring(cut, cut+1).toUpperCase() + head.substring(cut+1);
if (!germaNet.getLexUnits(head).isEmpty()) {
break;
}
// head all small
head = head.substring(cut);
if (!germaNet.getLexUnits(head).isEmpty()) {
break;
}
// in case no match yet, use original split with first letter upper case
head = noun.substring(modifier.length(), noun.length());
head = head.substring(0, 1).toUpperCase() + head.substring(1);
}
// System.out.println("head=" + head);
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: modifier is part-whole related", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: modifier is part-whole related", germaNet);
} else if (headRelated != null) {
String head = headRelated.getOrthForm();
int indexOfHead = noun.lastIndexOf(head.toLowerCase());
if (indexOfHead == -1) {
indexOfHead = noun.lastIndexOf(head);
}
String modifier = noun.substring(0, indexOfHead);
for (int cut = 1; cut < 4; cut++) {
if (modifier.length() > cut && modifiers.contains(modifier.substring(0, modifier.length() - cut))) {
modifier = modifier.substring(0, modifier.length() - cut);
break;
}
}
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: head is related", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: head is related", germaNet);
} else if (modifierRelated != null) {
String modifier = modifierRelated.getOrthForm();
String head = noun.substring(modifier.length(), noun.length());
for (int cut = 0; cut < 3; cut++) {
// head with first letter upper case
head = head.substring(cut, cut+1).toUpperCase() + head.substring(cut+1);
if (!germaNet.getLexUnits(head).isEmpty()) {
break;
}
// head all small
head = head.substring(cut);
if (!germaNet.getLexUnits(head).isEmpty()) {
break;
}
// in case no match yet, use original split with first letter upper case
head = noun.substring(modifier.length(), noun.length());
head = head.substring(0, 1).toUpperCase() + head.substring(1);
}
// System.out.println("head=" + head);
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: modifier is related", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: modifiers is related", germaNet);
} else if (headHypernym != null) {
String head = headHypernym.getHypernym().getOrthForm();
int indexOfHead = noun.lastIndexOf(head.toLowerCase());
if (indexOfHead == -1) {
indexOfHead = noun.lastIndexOf(head);
}
String modifier = noun.substring(0, indexOfHead);
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: head is hypernym", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: head is hypernym", germaNet);
} else if (modifierHypernym != null) {
String modifier = modifierHypernym.getHypernym().getOrthForm();
String head = noun.substring(modifier.length());
CompoundDeterminer.writeFile(compoundsDeterminedWithGNFile + fileEnding,
(new CompoundDBEntry(modifier, head, id,
"GN: modifier is hypernym", germaNet)).toSQLString() + "\n", true);
return new CompoundDBEntry(modifier, head, id,
"GN: modifier is hypernym", germaNet);
} else {
CompoundDeterminer.writeFile(nonCompoundsDeterminedWithGNFile + fileEnding,
noun + ": modifiers=" + modifiers + ", heads=" + heads + "\n", true);
}
return new CompoundDBEntry("", "", "", id, "");
}
/**
 * Remembers the suffix appended to the GermaNet result file names and writes
 * an empty string to both suffixed files with the last {@code writeFile}
 * argument set to {@code false} (presumably "do not append", i.e. the files
 * start out empty -- confirm against {@code CompoundDeterminer.writeFile}).
 *
 * @param fileEnding suffix appended to both result file names
 * @throws IOException if one of the files cannot be written
 */
public void setFileEnding(String fileEnding) throws IOException {
    this.fileEnding = fileEnding;
    String compoundsTarget = compoundsDeterminedWithGNFile + fileEnding;
    String nonCompoundsTarget = nonCompoundsDeterminedWithGNFile + fileEnding;
    CompoundDeterminer.writeFile(compoundsTarget, "", false);
    CompoundDeterminer.writeFile(nonCompoundsTarget, "", false);
}
}
package decompounder;
import decompounder.CompoundDisambiguator.DisambiguatedHypernym;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import java.io.FileInputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
*
* @author Verena Henrich
*/
public class CompoundDeterminerSMOR {
// GermaNet instance; passed on to CompoundDBEntry and CompoundDisambiguator when a split is resolved.
private GermaNet germaNet;
// JDBC connection on which prepareSMORAnalysis() runs the noun query defined below.
private final Connection connection;
// private String pathToSMORTool = "/Users/vhenrich/unison/arbuckle/Tools_Resources/SMOR/";
// Root directory of the SMOR tool; "src/fst-infl2" and the "lib/*.ca" files are resolved against it.
// NOTE(review): machine-specific absolute path -- has to be adapted for every installation.
private String pathToSMORTool = "/Users/bcmpbell/eclipse-workspace/Decompounder-mvn/src/main/resources/SMOR/";
// Raw SMOR output, read back with CompoundDeterminer.latin1Encoding (path is used relative to CompoundDeterminer.resultsFolder).
private String smorResultsLatin1 = "tmp/smor_results_latin1.out";
// Copy of the SMOR output re-written line by line; this is the file determineSMORResults() parses.
private String smorResultsUTF8 = "tmp/smor_results_utf8.out";
// Input file for SMOR: one orth form per line, produced from the query below.
private String allNounsInGermaNet = "tmp/all_nouns_in_germanet.txt";
// Base name of the log file that receives the SQL string of every compound split found via SMOR.
private String compoundsDeterminedWithSMORFile = "compounds_determined_with_smor";
// Base name of the log file that receives nouns for which SMOR gave no unique binary split.
private String nonCompoundsDeterminedWithSMORFile = "non_compounds_determined_with_smor";
// Suffix appended to both log file names; changed through setFileEnding().
private String fileEnding = "";
// Maps an orth form to its de-duplicated, normalised SMOR analyses; filled by determineSMORResults().
private Map<String, List<String>> nouns_smorAnalyses;
// Selects the distinct orth forms of all lexical units whose synset has word_category_id = 1
// (presumably the noun category -- confirm against the schema), that are longer than three
// characters and contain neither a blank nor a hyphen, ordered alphabetically.
private static final String GET_ALL_NOUN_ORTH_FORMS_WITHOUT_HYPHEN_QUERY = "SELECT distinct l.orth_form AS orth_form "
+ "FROM lex_unit_table l, synset_table s WHERE l.synset_id = s.id "
+ "and s.word_category_id = 1 and length(l.orth_form) > 3 "
+ "and l.orth_form not like '% %' and l.orth_form not like '%-%' "
+ "order by l.orth_form";
/**
 * Stores the given collaborators and immediately runs the two SMOR steps:
 * exporting the noun list and running the tool, then parsing its output
 * into the in-memory analysis map.
 *
 * @param germaNet   GermaNet instance used later when compound entries are built
 * @param connection database connection the noun list is queried from
 * @throws SQLException if querying the noun list fails
 * @throws IOException  if one of the intermediate files cannot be read or written
 */
public CompoundDeterminerSMOR(GermaNet germaNet, Connection connection) throws SQLException, IOException {
    this.connection = connection;
    this.germaNet = germaNet;

    // only needed to extract new noun list from GermaNet
    this.prepareSMORAnalysis();

    this.determineSMORResults();
}
/**
 * Exports all candidate nouns from the database, runs the external SMOR
 * analyser ({@code fst-infl2}) on them and re-writes the analyser output
 * line by line into {@link #smorResultsUTF8}.
 *
 * <p>Steps:
 * <ol>
 *   <li>reset the noun list file and the Latin-1 result file,</li>
 *   <li>write one orth form per line (Latin-1) from
 *       {@link #GET_ALL_NOUN_ORTH_FORMS_WITHOUT_HYPHEN_QUERY},</li>
 *   <li>start SMOR and wait for it to finish,</li>
 *   <li>read the Latin-1 output and append every line to the UTF-8 file.</li>
 * </ol>
 *
 * <p>If the thread is interrupted while waiting for SMOR, the interrupt flag
 * is restored, the incident is logged and the conversion step is skipped.
 *
 * @throws SQLException if the noun query fails
 * @throws IOException  if a file cannot be written/read or SMOR cannot be started
 */
private void prepareSMORAnalysis() throws SQLException, IOException {
    CompoundDeterminer.writeFile(allNounsInGermaNet, "", false);
    CompoundDeterminer.writeFile(smorResultsLatin1, "", false);

    // Statement and result set are released even when writing a noun fails.
    PreparedStatement findNounsStatement = connection.prepareStatement(GET_ALL_NOUN_ORTH_FORMS_WITHOUT_HYPHEN_QUERY);
    try {
        ResultSet results = findNounsStatement.executeQuery();
        try {
            while (results.next()) {
                CompoundDeterminer.writeFile(allNounsInGermaNet, results.getString("orth_form") + "\n",
                        true, CompoundDeterminer.latin1Encoding);
            }
        } finally {
            results.close();
        }
    } finally {
        findNounsStatement.close();
    }

    // Passed as an argument array so that paths containing blanks are not
    // torn apart by the whitespace tokenisation of Runtime.exec(String).
    String[] smorCommand = {
        pathToSMORTool + "src/fst-infl2",
        "-d",
        "-t", pathToSMORTool + "lib/smor.ca",
        "-t", pathToSMORTool + "lib/smor-cap.ca",
        "-t", pathToSMORTool + "lib/smor-uc.ca",
        "-t", pathToSMORTool + "lib/smor-ss.ca",
        "-t", pathToSMORTool + "lib/smor-ascii.ca",
        pathToSMORTool + "lib/smor-guesser.ca",
        CompoundDeterminer.resultsFolder + allNounsInGermaNet,
        CompoundDeterminer.resultsFolder + smorResultsLatin1
    };

    try {
        Process smorProcess = Runtime.getRuntime().exec(smorCommand);
        int exitCode = smorProcess.waitFor();
        if (exitCode != 0) {
            // Best effort: the output file is still converted, but the failure is made visible.
            Logger.getLogger(CompoundDeterminer.class.getName()).log(Level.WARNING,
                    "SMOR terminated with exit code " + exitCode);
        }

        CompoundDeterminer.writeFile(smorResultsUTF8, "", false);
        Scanner scanner = new Scanner(new FileInputStream(CompoundDeterminer.resultsFolder + smorResultsLatin1),
                CompoundDeterminer.latin1Encoding);
        try {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                CompoundDeterminer.writeFile(smorResultsUTF8, line + "\n", true);
            }
        } finally {
            scanner.close();
        }
    } catch (InterruptedException ex) {
        // Keep the interruption visible to callers further up the stack.
        Thread.currentThread().interrupt();
        Logger.getLogger(CompoundDeterminer.class.getName()).log(Level.SEVERE, null, ex);
    }
}
/**
 * Parses the UTF-8 SMOR output file and fills {@link #nouns_smorAnalyses}
 * with the normalised analyses of every analysed word form.
 *
 * <p>Input handling, as implemented below:
 * <ul>
 *   <li>a leading tag ({@code <...>}) at the very start of a line is dropped,</li>
 *   <li>a line starting with {@code "> "} introduces a new word form; the
 *       analyses collected for the previous word form are stored,</li>
 *   <li>lines starting with {@code "no result for "} are ignored,</li>
 *   <li>every other line is an analysis: the {@code <PREF>} tag is merged,
 *       inflection tags are removed, and the line is cut into constituents
 *       at every {@code '>'} that is directly followed by a word character.
 *       Constituents are joined by a blank; a constituent followed by one of
 *       the known derivational suffixes is glued to that suffix and, where
 *       it can be located in the original word form, replaced by the surface
 *       string with a trailing {@code <NN>}.</li>
 * </ul>
 * Analyses that do not split into at least two pieces are discarded, and
 * duplicates per word form are stored only once.
 *
 * <p>The analyses of the last word form in the file are stored after the
 * loop; previously they were lost because storing only happened when the
 * next {@code "> "} header was read.
 *
 * @throws SQLException never thrown by this implementation; kept for signature compatibility
 * @throws IOException  if the SMOR result file cannot be opened
 */
private void determineSMORResults() throws SQLException, IOException {
    // Derivational suffix patterns, checked in exactly this order (first match wins).
    // The text before the first '<' is the bare suffix.
    final String[] suffixPatterns = {
        "bar<ADJ><SUFF>",
        "tum<SUFF><+NN>",
        "ung<SUFF><+NN>",
        "ung<NN><SUFF>",
        "ie<SUFF><+NN>",
        "er<SUFF><+NN>",
        "er<NN><SUFF>",
        "ler<NN><SUFF>",
        "ler<SUFF><+NN>",
        "keit<SUFF><+NN>",
        "keit<NN><SUFF>",
        "heit<SUFF><+NN>",
        "chen<SUFF><+NN>",
        "igkeit<SUFF><+NN>",
        "schaft<SUFF><+NN>",
        "erei<SUFF><+NN>",
        "ei<SUFF><+NN>"
    };

    Scanner scanner = new Scanner(new FileInputStream(CompoundDeterminer.resultsFolder + smorResultsUTF8),
            CompoundDeterminer.utf8Encoding);
    nouns_smorAnalyses = new HashMap<String, List<String>>();

    try {
        String currentOrthForm = "";
        List<String> smorAnalyses = new ArrayList<String>();

        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();

            if (line.startsWith("<")) {
                line = line.substring(line.indexOf(">") + 1);
            }

            if (line.startsWith("> ")) {
                // Header of the next word form: flush what was collected so far.
                if (smorAnalyses.size() > 0) {
                    nouns_smorAnalyses.put(currentOrthForm, smorAnalyses);
                    smorAnalyses = new ArrayList<String>();
                }
                currentOrthForm = line.substring(2);
                continue;
            }
            if (line.startsWith("no result for ")) {
                continue;
            }

            line = stripInflectionTags(mergePrefTag(line));

            String[] splittedLine = line.split(">(\\w|\\ä|\\ö|\\ü)");
            if (splittedLine.length <= 1) {
                continue;
            }

            String smorResult = "";
            for (int i = 0; i < splittedLine.length; i++) {
                // Length of the current piece inside 'line': the split consumed the '>' at
                // its end and, for inner pieces, additionally the first character.
                int index = splittedLine[i].length() + 1;
                if (splittedLine.length > 2 && i > 0 && i < splittedLine.length - 1) {
                    index++;
                }

                String nounEnding = "";
                int nounEndingWithTagsLength = 0;
                if (i < splittedLine.length - 1) {
                    String rest = line.substring(index);
                    for (String pattern : suffixPatterns) {
                        if (rest.startsWith(pattern)) {
                            nounEnding = pattern.substring(0, pattern.indexOf('<'));
                            nounEndingWithTagsLength = pattern.length();
                            break;
                        }
                    }
                }

                if (!nounEnding.equals("")) {
                    int wordLength = line.indexOf("<");

                    // Locate the stem in the original word form, shortening it step by step.
                    int start = 0;
                    if (i > 0) {
                        String lowerOrthForm = currentOrthForm.toLowerCase();
                        int cut = 2;
                        while (wordLength > cut && start < 1) {
                            start = lowerOrthForm.indexOf(line.substring(0, wordLength - cut).toLowerCase());
                            cut++;
                        }
                    }

                    int end = -1;
                    if (i == splittedLine.length - 2) {
                        end = currentOrthForm.length();
                    } else {
                        int endingPosition = currentOrthForm.indexOf(nounEnding,
                                start + wordLength - nounEnding.length());
                        if (endingPosition != -1) {
                            end = endingPosition + nounEnding.length();
                        }
                    }

                    if (start == -1) {
                        smorResult += line.substring(0, index) + nounEnding;
                    } else if (end == -1) {
                        smorResult += line.substring(0, index + nounEndingWithTagsLength);
                    } else {
                        smorResult += currentOrthForm.substring(start, start + 1).toUpperCase()
                                + currentOrthForm.substring(start + 1, end) + "<NN>";
                    }

                    // The suffix piece has been consumed together with the stem.
                    index += nounEndingWithTagsLength;
                    i++;
                } else {
                    smorResult += line.substring(0, index);
                }

                if (i < splittedLine.length - 1) {
                    smorResult += " ";
                }
                line = line.substring(index);
            }

            smorResult = smorResult.replaceAll("<NN> in<SUFF><\\+NN>", "in<+NN>");
            smorResult = smorResult.replaceAll(" in<SUFF><\\+NN>", "in<+NN>");
            smorResult = smorResult.replaceAll(" keit<SUFF><\\+NN>", "keit<+NN>");
            smorResult = smorResult.replaceAll(" keit<NN><SUFF>", "keit<NN>");

            if (!smorAnalyses.contains(smorResult)) {
                smorAnalyses.add(smorResult);
            }
        }

        // Store the analyses of the final word form; no further "> " header follows it.
        if (smorAnalyses.size() > 0) {
            nouns_smorAnalyses.put(currentOrthForm, smorAnalyses);
        }
    } finally {
        scanner.close();
    }
}

/**
 * Handles the first {@code <PREF>} tag of an analysis line. If the tag is
 * directly followed by an upper-case letter, prefix and following word are
 * fused: the first character of the prefix is upper-cased and the following
 * word is lower-cased up to (not including) its last character before the
 * next tag. Otherwise every {@code <PREF>} tag is simply removed.
 * Lines without the tag are returned unchanged.
 */
private static String mergePrefTag(String line) {
    int prefIndex = line.indexOf("<PREF>");
    if (prefIndex < 0) {
        return line;
    }
    int afterPref = prefIndex + 6;
    int wordStart = line.substring(0, prefIndex).lastIndexOf(">") + 1;

    // Guard against a tag at the very end of the line or an empty prefix,
    // both of which would otherwise run into a StringIndexOutOfBoundsException.
    boolean mergeable = afterPref < line.length()
            && wordStart < prefIndex
            && line.substring(afterPref, afterPref + 1).matches("[A-ZÄÖÜ]");
    if (!mergeable) {
        return line.replaceAll("<PREF>", "");
    }

    String rest = line.substring(afterPref);
    int wordEnd = rest.indexOf("<") - 1;
    if (wordEnd < 0) {
        // No further tag: treat the whole remainder as the word.
        wordEnd = rest.length();
    }
    return line.substring(0, wordStart)
            + line.substring(wordStart, wordStart + 1).toUpperCase()
            + line.substring(wordStart + 1, prefIndex)
            + rest.substring(0, wordEnd).toLowerCase()
            + rest.substring(wordEnd);
}

/**
 * Removes the particle, gender, case, number, degree and orthography tags
 * listed below from an analysis line, in the given order.
 */
private static String stripInflectionTags(String line) {
    final String[] tags = {
        "<VPART>", "<VPREF>",
        "<Fem>", "<Masc>", "<Neut>",
        "<Acc>", "<Dat>", "<Gen>", "<Nom>",
        "<Sg>", "<Pl>",
        "<Simp>", "<Pos>", "<Invar>",
        "<NEWORTH>", "<OLDORTH>", "<Old>"
    };
    String stripped = line;
    for (String tag : tags) {
        stripped = stripped.replace(tag, "");
    }
    return stripped;
}
/**
 * Tries to split {@code noun} into modifier and head using the SMOR analyses
 * collected for it.
 *
 * <p>Decision order:
 * <ol>
 *   <li>exactly one analysis that is binary: use it,</li>
 *   <li>several analyses that collapse to one binary analysis once tags are
 *       ignored: use it,</li>
 *   <li>several distinct analyses, all binary: pick the one whose head (or,
 *       failing that, whose modifier) is a hypernym of the noun at a distance
 *       below 5, provided exactly one analysis matches,</li>
 *   <li>otherwise no split is chosen; the candidates are logged and listed
 *       in the comment of the returned entry.</li>
 * </ol>
 *
 * <p>Known problems with SMOR:
 * <ul>
 *   <li>Amtsärztin --> Amt+Arztin</li>
 *   <li>Türklinke --> Türe+Klinke</li>
 *   <li>many words that exist both as nouns and as verbs are analysed as
 *       verbs (denominalisation), e.g. -ung, -keit, -heit</li>
 *   <li>Festmacherleine --> festmachener+Leine</li>
 * </ul>
 *
 * @param id   identifier handed to {@code germaNet.getLexUnitByID} and to the created entry
 * @param noun orth form to look up in the parsed SMOR analyses
 * @return an entry built from (modifier, head, id, comment, germaNet) when a split was
 *         chosen; otherwise an entry built from (noun, "", "", id, comment)
 * @throws IOException  if a log file cannot be written
 * @throws SQLException declared for the calls into the project classes
 */
public CompoundDBEntry determineCompoundWithSMOR(int id, String noun) throws IOException, SQLException {
    List<String> smorAnalyses = nouns_smorAnalyses.get(noun);
    final String noModifier = "";
    final String noHead = "";

    if (smorAnalyses == null) {
        return new CompoundDBEntry(noun, noModifier, noHead, id, "");
    }

    if (smorAnalyses.size() == 1) { // only one SMOR result
        String onlyAnalysis = smorAnalyses.get(0);
        String[] smorResult = onlyAnalysis.split(" ");
        if (smorResult.length == 2) { // only one binary SMOR result
            return recordCompound(id, smorResult[0].split("<")[0], smorResult[1].split("<")[0],
                    "SMOR: only one binary result: " + onlyAnalysis);
        }
        if (smorResult.length > 2) { // only one non-binary SMOR result
            String comment = "SMOR: only one non binary result: " + onlyAnalysis;
            CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding,
                    noun + " " + comment + "\n", true);
            return new CompoundDBEntry(noun, noModifier, noHead, id, comment);
        }
        // a single analysis that does not split at all
        return new CompoundDBEntry(noun, noModifier, noHead, id, "");
    }

    // more than one SMOR result:
    // filter SMOR results that are equal when tags are not regarded and extract binary results
    List<String> distinctWithoutTags = new ArrayList<String>();
    List<String> distinctWithTags = new ArrayList<String>();
    List<String> binaryWithoutTags = new ArrayList<String>();
    List<String> binaryWithTags = new ArrayList<String>();
    for (String smorResult : smorAnalyses) {
        String withoutTags = smorResult.replaceAll("<[^>]+>", "");
        if (distinctWithoutTags.contains(withoutTags)) {
            continue;
        }
        distinctWithoutTags.add(withoutTags);
        distinctWithTags.add(smorResult);
        if (withoutTags.split(" ").length == 2) {
            binaryWithoutTags.add(withoutTags);
            binaryWithTags.add(smorResult);
        }
    }

    if (distinctWithTags.size() == 1 && binaryWithTags.size() == 1) {
        // only one binary result left after filtering
        String[] parts = distinctWithTags.get(0).split(" ");
        return recordCompound(id, parts[0].split("<")[0], parts[1].split("<")[0],
                "SMOR: only one binary result (without regarding tags): " + binaryWithTags.get(0));
    }

    if (distinctWithTags.size() == binaryWithTags.size()) {
        // more than one (only binary) results left after filtering
        List<String> potentialHeads = new ArrayList<String>();
        List<String> potentialModifiers = new ArrayList<String>();
        for (String withoutTags : binaryWithoutTags) {
            String[] parts = withoutTags.split(" ");
            if (!potentialHeads.contains(parts[1])) {
                potentialHeads.add(parts[1]);
            }
            if (!potentialModifiers.contains(parts[0])) {
                potentialModifiers.add(parts[0]);
            }
        }

        DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(
                germaNet.getLexUnitByID(id), potentialHeads, germaNet);
        String headHypernymString = "";
        if (headHypernym != null && headHypernym.getDistance() < 5) {
            headHypernymString = headHypernym.getHypernym().getOrthForm();
        }

        // Fixed: the modifier hypernym is looked up among the potential modifiers
        // (the heads were passed here before, so the modifier list was never used).
        DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(
                germaNet.getLexUnitByID(id), potentialModifiers, germaNet);
        String modifierHypernymString = "";
        if (modifierHypernym != null && modifierHypernym.getDistance() < 5) {
            modifierHypernymString = modifierHypernym.getHypernym().getOrthForm();
        }

        // extract matches for headHypernym and modifierHypernym
        List<String> headHypernymMatch = new ArrayList<String>();
        List<String> modifierHypernymMatch = new ArrayList<String>();
        for (String withTags : binaryWithTags) {
            String[] parts = withTags.split(" ");
            if (parts[1].split("<")[0].equals(headHypernymString)) {
                headHypernymMatch.add(withTags);
            }
            if (parts[0].split("<")[0].equals(modifierHypernymString)) {
                modifierHypernymMatch.add(withTags);
            }
        }

        if (headHypernymMatch.size() == 1) {
            String match = headHypernymMatch.get(0);
            String[] parts = match.split(" ");
            return recordCompound(id, parts[0].split("<")[0], parts[1].split("<")[0],
                    "SMOR: head is hypernym: " + match);
        }
        if (modifierHypernymMatch.size() == 1) {
            String match = modifierHypernymMatch.get(0);
            String[] parts = match.split(" ");
            return recordCompound(id, parts[0].split("<")[0], parts[1].split("<")[0],
                    "SMOR: modifier is hypernym: " + match);
        }

        StringBuilder comment = new StringBuilder("SMOR: binary after filtering: ");
        for (String analysis : distinctWithTags) {
            CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding,
                    noun + " (binary after filtering): " + analysis + "\n", true);
            comment.append(analysis).append("; ");
        }
        return new CompoundDBEntry(noun, noModifier, noHead, id, comment.toString());
    }

    if (distinctWithTags.size() == 1) {
        // only one (non-binary) result left after filtering
        String comment = "SMOR: one non binary after filtering): " + distinctWithTags.get(0);
        CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding,
                noun + " " + comment + "\n\n", true);
        return new CompoundDBEntry(noun, noModifier, noHead, id, comment);
    }

    // more than one (non-binary) results left after filtering
    StringBuilder comment = new StringBuilder("SMOR: more non binary: ");
    for (String analysis : distinctWithTags) {
        CompoundDeterminer.writeFile(nonCompoundsDeterminedWithSMORFile + fileEnding,
                noun + " (more non binary): " + analysis + "\n", true);
        comment.append(analysis).append("; ");
    }
    return new CompoundDBEntry(noun, noModifier, noHead, id, comment.toString());
}

/**
 * Builds the entry for a chosen split once, appends its SQL string to the
 * compounds log file and returns that same entry.
 */
private CompoundDBEntry recordCompound(int id, String modifier, String head, String comment)
        throws IOException, SQLException {
    CompoundDBEntry entry = new CompoundDBEntry(modifier, head, id, comment, germaNet);
    CompoundDeterminer.writeFile(compoundsDeterminedWithSMORFile + fileEnding,
            entry.toSQLString() + "\n", true);
    return entry;
}
/**
 * Remembers the suffix appended to the SMOR log file names and writes an
 * empty string to both suffixed files with the last {@code writeFile}
 * argument set to {@code false} (presumably "do not append", i.e. the files
 * start out empty -- confirm against {@code CompoundDeterminer.writeFile}).
 *
 * @param fileEnding suffix appended to both log file names
 * @throws IOException if one of the files cannot be written
 */
public void setFileEnding(String fileEnding) throws IOException {
    this.fileEnding = fileEnding;
    final String[] logBaseNames = {compoundsDeterminedWithSMORFile, nonCompoundsDeterminedWithSMORFile};
    for (String baseName : logBaseNames) {
        CompoundDeterminer.writeFile(baseName + fileEnding, "", false);
    }
}
}
package decompounder;
import de.tuebingen.uni.sfs.germanet.dbapi.ConRelType;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import de.tuebingen.uni.sfs.germanet.dbapi.LexRelType;
import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit;
import de.tuebingen.uni.sfs.germanet.dbapi.Synset;
import java.util.ArrayList;
import java.util.List;
/**
*
* @author Verena Henrich
*/
public class CompoundDisambiguator {
/**
 * Convenience variant for a single candidate: wraps {@code potentialHypernym}
 * in a one-element list and delegates to the list-based overload.
 *
 * @param compound          lexical unit whose hypernyms are searched
 * @param potentialHypernym orth form that may name a hypernym of {@code compound}
 * @param germaNet          GermaNet instance used for the lookup
 * @return the matching hypernym reading together with its distance, or
 *         {@code null} if {@code potentialHypernym} is no hypernym of {@code compound}
 */
public static DisambiguatedHypernym disambiguateHypernym(LexUnit compound, String potentialHypernym, GermaNet germaNet) {
    List<String> singleCandidate = new ArrayList<String>(1);
    singleCandidate.add(potentialHypernym);
    DisambiguatedHypernym result = disambiguateHypernym(compound, singleCandidate, germaNet);
    return result;
}
// public static DisambiguatedHypernym disambiguateHypernym(LexUnit compound, List<String> potentialHypernyms, GermaNet germaNet) {
// return disambiguateHypernym(compound, potentialHypernyms, germaNet, 1000);
// }
/**
 * Searches the hypernyms of {@code compound} for a lexical unit whose orth
 * form is one of {@code potentialHypernyms}.
 *
 * <p>The lists returned by {@code getAllHyperonyms()} are visited in order
 * (presumably one list per hypernym level, nearest first -- confirm against
 * the GermaNet API). As soon as one list produced a match, later lists are
 * not inspected. Within that list the match with the longest orth form wins;
 * when a further match shows up in a different synset of the same list, a
 * warning naming both candidates is printed to {@code System.err}.
 *
 * <p>Compared with the previous version, the candidate strings are no longer
 * resolved through {@code germaNet.getLexUnits} up front (the result was
 * never used), and the warning now names the previously selected hypernym
 * instead of printing the new one twice.
 *
 * @param compound           lexical unit whose hypernyms are searched
 * @param potentialHypernyms orth forms that may name a hypernym
 * @param germaNet           GermaNet instance used to re-load {@code compound} by id
 * @return the selected hypernym with the 1-based position of the list it was
 *         found in, or {@code null} if nothing matched
 */
public static DisambiguatedHypernym disambiguateHypernym(LexUnit compound, List<String> potentialHypernyms, GermaNet germaNet) {
    LexUnit hypernym = null;
    int distance = -1;
    int level = 1;

    List<List<Synset>> allHypernyms = germaNet.getLexUnitByID(compound.getId()).getSynset().getAllHyperonyms();
    for (List<Synset> synsetsOnLevel : allHypernyms) {
        if (hypernym != null) {
            // an earlier list already yielded a hypernym
            break;
        }
        for (Synset synset : synsetsOnLevel) {
            // true once this synset supplied the very first match
            boolean matchInSameSynset = false;
            for (LexUnit lexUnit : synset.getLexUnits()) {
                if (!potentialHypernyms.contains(lexUnit.getOrthForm())) {
                    continue;
                }
                LexUnit previous = hypernym;
                if (hypernym == null) {
                    hypernym = lexUnit;
                    distance = level;
                    matchInSameSynset = true;
                } else if (hypernym.getOrthForm().length() < lexUnit.getOrthForm().length()) {
                    // prefer the longer orth form
                    hypernym = lexUnit;
                    distance = level;
                }
                if (!matchInSameSynset) {
                    // 'previous' is non-null here: the first match always sets matchInSameSynset
                    System.err.println(compound.getOrthForm() + " " + compound.getId() + ": two hypernyms possible ("
                            + previous.getOrthForm() + " and " + lexUnit.getOrthForm() + ")");
                }
            }
        }
        level++;
    }

    return hypernym != null ? new DisambiguatedHypernym(hypernym, distance) : null;
}
/**
 * Immutable result pair of a hypernym lookup: the selected lexical unit and
 * the distance value recorded when it was found.
 */
public static class DisambiguatedHypernym {

    private final int distance;
    private final LexUnit hypernym;

    /**
     * @param head     lexical unit selected as hypernym
     * @param distance distance value recorded for that hypernym
     */
    public DisambiguatedHypernym(LexUnit head, int distance) {
        this.distance = distance;
        this.hypernym = head;
    }

    /** @return the distance value passed to the constructor */
    public int getDistance() {
        return this.distance;
    }

    /** @return the lexical unit passed to the constructor */
    public LexUnit getHypernym() {
        return this.hypernym;
    }
}
/**
 * Returns the first lexical unit that is connected to the synset of
 * {@code compound} through one of the component, member, portion or
 * substance holonym/meronym relations and whose orthographic form is both
 * listed in {@code potentialPWRelatedLexUnits} and more than three
 * characters shorter than the orthographic form of {@code compound}.
 * <p>
 * The related synsets are gathered into a list owned by this method, so the
 * lists handed out by {@code getRelatedSynsets} are never modified.
 *
 * @param compound                   the lexical unit whose synset relations are inspected
 * @param potentialPWRelatedLexUnits orthographic forms accepted as a match
 * @param germaNet                   not used by this method
 * @return the first matching lexical unit, or {@code null} if there is none
 */
public static LexUnit disambiguatePWRelation(LexUnit compound, List<String> potentialPWRelatedLexUnits, GermaNet germaNet) {
    Synset compoundSynset = compound.getSynset();
    // Collect into a fresh list instead of appending to a list returned by the API.
    List<Synset> synsetsInPWR = new ArrayList<Synset>();
    synsetsInPWR.addAll(compoundSynset.getRelatedSynsets(ConRelType.HAS_COMPONENT_HOLONYM));
    synsetsInPWR.addAll(compoundSynset.getRelatedSynsets(ConRelType.HAS_COMPONENT_MERONYM));
    synsetsInPWR.addAll(compoundSynset.getRelatedSynsets(ConRelType.HAS_MEMBER_HOLONYM));
    synsetsInPWR.addAll(compoundSynset.getRelatedSynsets(ConRelType.HAS_MEMBER_MERONYM));
    synsetsInPWR.addAll(compoundSynset.getRelatedSynsets(ConRelType.HAS_PORTION_HOLONYM));
    synsetsInPWR.addAll(compoundSynset.getRelatedSynsets(ConRelType.HAS_PORTION_MERONYM));
    synsetsInPWR.addAll(compoundSynset.getRelatedSynsets(ConRelType.HAS_SUBSTANCE_HOLONYM));
    synsetsInPWR.addAll(compoundSynset.getRelatedSynsets(ConRelType.HAS_SUBSTANCE_MERONYM));
    for (Synset synsetInPWR : synsetsInPWR) {
        for (LexUnit lexUnit : synsetInPWR.getLexUnits()) {
            String orthForm = lexUnit.getOrthForm();
            // The candidate must be more than three characters shorter than the compound.
            if (potentialPWRelatedLexUnits.contains(orthForm)
                    && orthForm.length() < compound.getOrthForm().length() - 3) {
                return lexUnit;
            }
        }
    }
    return null;
}
/**
 * Considers relations other than part-whole relations and hypernyms.
 * <p>
 * First the synsets related to the synset of {@code compound} via
 * {@code IS_ENTAILED_BY}, {@code IS_RELATED_TO}, {@code CAUSES},
 * {@code ENTAILS} and {@code GermaNet.HAS_HYPONYM} are searched, in that
 * order. Only if they yield no match are the lexical units related to
 * {@code compound} via {@code HAS_ANTONYM} and {@code HAS_PERTAINYM}
 * searched. A lexical unit matches when its orthographic form is listed in
 * {@code potentialRelatedLexUnits} and is more than three characters shorter
 * than the orthographic form of {@code compound}.
 * <p>
 * Results are gathered into lists owned by this method, so the lists handed
 * out by {@code getRelatedSynsets} and {@code getRelatedLexUnits} are never
 * modified.
 *
 * @param compound                 the lexical unit whose relations are inspected
 * @param potentialRelatedLexUnits orthographic forms accepted as a match
 * @param germaNet                 not used by this method
 * @return the first matching lexical unit, or {@code null} if there is none
 */
public static LexUnit disambiguateRelation(LexUnit compound, List<String> potentialRelatedLexUnits, GermaNet germaNet) {
    Synset compoundSynset = compound.getSynset();
    // Collect into a fresh list instead of appending to a list returned by the API.
    List<Synset> relatedSynsets = new ArrayList<Synset>();
    relatedSynsets.addAll(compoundSynset.getRelatedSynsets(ConRelType.IS_ENTAILED_BY));
    relatedSynsets.addAll(compoundSynset.getRelatedSynsets(ConRelType.IS_RELATED_TO));
    relatedSynsets.addAll(compoundSynset.getRelatedSynsets(ConRelType.CAUSES));
    relatedSynsets.addAll(compoundSynset.getRelatedSynsets(ConRelType.ENTAILS));
    relatedSynsets.addAll(compoundSynset.getRelatedSynsets(GermaNet.HAS_HYPONYM));
    for (Synset relatedSynset : relatedSynsets) {
        for (LexUnit lexUnit : relatedSynset.getLexUnits()) {
            String orthForm = lexUnit.getOrthForm();
            // The candidate must be more than three characters shorter than the compound.
            if (potentialRelatedLexUnits.contains(orthForm)
                    && orthForm.length() < compound.getOrthForm().length() - 3) {
                return lexUnit;
            }
        }
    }
    // Lexical relations are only consulted when no conceptual relation matched.
    List<LexUnit> relatedLexUnits = new ArrayList<LexUnit>();
    relatedLexUnits.addAll(compound.getRelatedLexUnits(LexRelType.HAS_ANTONYM));
    relatedLexUnits.addAll(compound.getRelatedLexUnits(LexRelType.HAS_PERTAINYM));
    for (LexUnit lexUnit : relatedLexUnits) {
        String orthForm = lexUnit.getOrthForm();
        if (potentialRelatedLexUnits.contains(orthForm)
                && orthForm.length() < compound.getOrthForm().length() - 3) {
            return lexUnit;
        }
    }
    return null;
}
/**
 * Returns the first entry of {@code compound.getSynonyms()} whose
 * orthographic form is listed in {@code potentialSynonyms} and is more than
 * three characters shorter than the orthographic form of {@code compound}.
 *
 * @param compound          the lexical unit whose synonyms are inspected
 * @param potentialSynonyms orthographic forms accepted as a match
 * @param germaNet          not used by this method
 * @return the first matching synonym, or {@code null} if there is none
 */
public static LexUnit disambiguateSynonym(LexUnit compound, List<String> potentialSynonyms, GermaNet germaNet) {
    for (LexUnit synonym : compound.getSynonyms()) {
        String synonymForm = synonym.getOrthForm();
        if (!potentialSynonyms.contains(synonymForm)) {
            continue;
        }
        // Accept only synonyms that are more than three characters shorter than the compound.
        int lengthGap = compound.getOrthForm().length() - synonymForm.length();
        if (lengthGap > 3) {
            return synonym;
        }
    }
    return null;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment