Commit 0b611765 authored by Ben Campbell's avatar Ben Campbell
Browse files

Initial commit

parents
File added
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry including="**/*.java" kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/classes"/>
</classpath>
/target/
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>Decompounder-mvn</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
</projectDescription>
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding//src/test/resources=UTF-8
encoding/<project>=UTF-8
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.6
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>de.tuebingen.uni.sfs.germanet</groupId>
<artifactId>Decompounder-mvn</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>Decompounder</name>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<scm>
<url>https://weblicht.sfs.uni-tuebingen.de/gitlab/germanet/Decompounder-mvn</url>
<connection>scm:git:https://weblicht.sfs.uni-tuebingen.de/gitlab/germanet/Decompounder-mvn</connection>
<developerConnection>scm:git:https://weblicht.sfs.uni-tuebingen.de/gitlab/germanet/Decompounder-mvn.git
</developerConnection>
<tag>HEAD</tag>
</scm>
<repositories>
<repository>
<id>sfs-clarind-nexus</id>
<url>http://t.weblicht.sfs.uni-tuebingen.de/nexus/content/repositories/releases</url>
</repository>
<repository>
<id>sfs-clarind-nexus-snapshot</id>
<url>http://t.weblicht.sfs.uni-tuebingen.de/nexus/content/repositories/snapshots</url>
</repository>
<repository>
<id>TU-Darmstadt</id>
<url>http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-releases/</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>de.tuebingen.uni.sfs.germanet</groupId>
<artifactId>GernEdiT</artifactId>
<version>9.1</version>
</dependency>
<dependency>
<groupId>antlr</groupId>
<artifactId>antlr</artifactId>
<version>2.7.6</version>
</dependency>
<dependency>
<groupId>asm</groupId>
<artifactId>asm-attrs</artifactId>
<version>2.2.3</version>
</dependency>
<dependency>
<groupId>asm</groupId>
<artifactId>asm</artifactId>
<version>3.3.1</version>
</dependency>
<dependency>
<groupId>cglib</groupId>
<artifactId>cglib</artifactId>
<version>2.1_3</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.1</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
<version>1.2.3</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>ejb3-persistence</artifactId>
<version>1.0.2.GA</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>8.3-606.jdbc3</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-annotations</artifactId>
<version>3.5.6-Final</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-commons-annotations</artifactId>
<version>3.2.0.Final</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-entitymanager</artifactId>
<version>3.5.6-Final</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-tools</artifactId>
<version>3.5.1.Final</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate</artifactId>
<version>3.5.4-Final</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.javassist</groupId>
<artifactId>javassist</artifactId>
<version>3.20.0-GA</version>
</dependency>
<dependency>
<groupId>de.uni_leipzig.asv.toolbox</groupId>
<artifactId>toolbox-utils</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>de.uni_leipzig.asv.toolbox</groupId>
<artifactId>de.uni_leipzig.asv.toolbox.baseforms</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>javax.transaction</groupId>
<artifactId>jta</artifactId>
<version>1.1</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-release-plugin</artifactId>
<version>2.4.2</version>
<dependencies>
<dependency>
<groupId>org.apache.maven.scm</groupId>
<artifactId>maven-scm-provider-gitexe</artifactId>
<version>1.8.1</version>
</dependency>
</dependencies>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.7</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<failOnError>false</failOnError>
<additionalparam>-Xdoclint:none</additionalparam>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>
<plugin>
<!-- Shade plugin: bound to the package phase below, it repackages the project together with its dependencies into one jar. -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>1.6</version>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
<filters>
<filter>
<!-- Applies to every bundled artifact: drop jar signature files, which would no longer match the repackaged contents. -->
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<!-- Merges META-INF/services provider files coming from different dependencies. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<!-- Writes the Main-Class entry of the shaded jar's manifest. -->
<!-- NOTE(review): this names a class in the "editor" package, while the sources of this project are in package "decompounder"; confirm that this is the intended entry point. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>de.tuebingen.uni.sfs.germanet.editor.GermaNetEditorApp</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
<resources>
<resource>
<filtering>false</filtering>
<directory>src/main/java</directory>
<includes>
<include>**</include>
</includes>
<excludes>
<exclude>**/*.java</exclude>
</excludes>
</resource>
<resource>
<filtering>false</filtering>
<directory>src/main/resources</directory>
<includes>
<include>**</include>
</includes>
</resource>
</resources>
</build>
</project>
\ No newline at end of file
File added
package decompounder;
import decompounder.CompoundDisambiguator.DisambiguatedHypernym;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit;
import java.util.ArrayList;
import java.util.List;
/**
 * One analysed compound: the compound itself, its modifier and head
 * constituents, and bookkeeping about whether the constituents were found in
 * GermaNet (and at which hypernym distance).
 *
 * <p>Ids and hypernym distances are {@code -1} while unknown. Text fields
 * default to {@code "\\N"}, which is the default NULL marker of PostgreSQL's
 * COPY text format (see {@link #COPY_INTO_QUERY}).
 *
 * <p>Instances are mutable; the fields that take part in
 * {@link #equals(Object)} and {@link #hashCode()} can be changed through
 * setters, so do not modify an entry while it is stored in a hash-based
 * collection.
 *
 * @author Verena Henrich
 */
public class CompoundDBEntry {
    public static final String COMPOSITIONAL_TYPE_EXACT_MATCH = "exact match";
    public static final String COMPOSITIONAL_TYPE_MODIFIER_E = "modifier e";
    public static final String COMPOSITIONAL_TYPE_INTERFIX = "interfix";
    public static final String COMPOSITIONAL_TYPE_HEAD_IS_HYPERNYM = "head is hypernym";

    /** Header line of a COPY statement that loads entries from stdin. */
    public static final String COPY_INTO_QUERY = "COPY compounds_table "
            + "(compound, compound_id, modifier, modifier_id, head, head_id, "
            + "modifier_in_gn, head_in_gn, modifier_hypernym_distance, "
            + "head_hypernym_distance, comment) FROM stdin;\n";

    private String modifier;
    private String head;
    private int compoundId;
    private int modifierId = -1;
    private int headId = -1;
    private String comment = "\\N";
    private boolean modifierIsInGermaNet = false;
    private boolean headIsInGermaNet = false;
    private int headHypernymDistance = -1;
    private int modifierHypernymDistance = -1;
    private String compositionalType = "\\N";
    private String compound = "\\N";

    /**
     * Creates a fully specified entry; every value is stored as given.
     *
     * @param compound orthographic form of the compound
     * @param modifier modifier constituent
     * @param modifierId id of the modifier, or -1 if unknown
     * @param head head constituent
     * @param headId id of the head, or -1 if unknown
     * @param compoundId id of the compound
     * @param comment free-text comment
     * @param modifierIsInGermaNet whether the modifier was found in GermaNet
     * @param headIsInGermaNet whether the head was found in GermaNet
     * @param modifierHypernymDistance hypernym distance of the modifier, or -1
     * @param headHypernymDistance hypernym distance of the head, or -1
     */
    public CompoundDBEntry(String compound, String modifier, int modifierId, String head, int headId,
            int compoundId, String comment, boolean modifierIsInGermaNet,
            boolean headIsInGermaNet, int modifierHypernymDistance, int headHypernymDistance) {
        this.compound = compound;
        this.modifier = modifier;
        this.head = head;
        this.modifierId = modifierId;
        this.headId = headId;
        this.compoundId = compoundId;
        this.comment = comment;
        this.modifierIsInGermaNet = modifierIsInGermaNet;
        this.headIsInGermaNet = headIsInGermaNet;
        this.modifierHypernymDistance = modifierHypernymDistance;
        this.headHypernymDistance = headHypernymDistance;
    }

    /**
     * Creates an entry without any GermaNet information: ids and distances
     * stay -1 and both "in GermaNet" flags stay {@code false}.
     *
     * @param compound orthographic form of the compound
     * @param modifier modifier constituent
     * @param head head constituent
     * @param compoundId id of the compound
     * @param comment free-text comment
     */
    public CompoundDBEntry(String compound, String modifier, String head, int compoundId, String comment) {
        this.compound = compound;
        this.modifier = modifier;
        this.head = head;
        this.compoundId = compoundId;
        this.comment = comment;
    }

    /**
     * Creates an entry and resolves both constituents against GermaNet.
     *
     * <p>For each constituent the hypernyms of the compound are searched first
     * (via {@code CompoundDisambiguator.disambiguateHypernym}); a hit supplies
     * the id and the hypernym distance. Otherwise the constituent is looked up
     * by its form: exactly one lexical unit supplies the id, no lexical unit
     * clears the "in GermaNet" flag, and several lexical units leave the id at
     * -1 with the flag still set.
     *
     * @param modifier modifier constituent
     * @param head head constituent
     * @param compoundId id of the compound; its orthographic form is read
     *        from GermaNet
     * @param comment free-text comment
     * @param germaNet GermaNet instance used for all lookups
     */
    public CompoundDBEntry(String modifier, String head, int compoundId,
            String comment, GermaNet germaNet) {
        this.modifier = modifier;
        this.head = head;
        this.compoundId = compoundId;
        this.comment = comment;
        this.modifierIsInGermaNet = true;
        this.headIsInGermaNet = true;

        // Fetch the compound once and reuse it for both constituents.
        LexUnit compoundUnit = germaNet.getLexUnitByID(compoundId);
        this.compound = compoundUnit.getOrthForm();

        DisambiguatedHypernym hypernym = CompoundDisambiguator.disambiguateHypernym(compoundUnit, modifier, germaNet);
        if (hypernym == null) {
            List<LexUnit> lexUnits = germaNet.getLexUnits(modifier);
            if (lexUnits.size() == 1) {
                modifierId = lexUnits.get(0).getId();
            } else if (lexUnits.isEmpty()) {
                this.modifierIsInGermaNet = false;
            }
        } else {
            modifierId = hypernym.getHypernym().getId();
            modifierHypernymDistance = hypernym.getDistance();
        }

        hypernym = CompoundDisambiguator.disambiguateHypernym(compoundUnit, head, germaNet);
        if (hypernym == null) {
            List<LexUnit> lexUnits = germaNet.getLexUnits(head);
            if (lexUnits.size() == 1) {
                headId = lexUnits.get(0).getId();
            } else if (lexUnits.isEmpty()) {
                this.headIsInGermaNet = false;
            }
        } else {
            headId = hypernym.getHypernym().getId();
            headHypernymDistance = hypernym.getDistance();
        }
    }

    /**
     * @return the id of the compound
     */
    public Integer getCompoundId() {
        return this.compoundId;
    }

    /**
     * @return the modifier
     */
    public String getModifier() {
        return this.modifier;
    }

    /**
     * @param modifier the modifier to set
     */
    public void setModifier(String modifier) {
        this.modifier = modifier;
    }

    /**
     * @return the head
     */
    public String getHead() {
        return this.head;
    }

    /**
     * @param head the head to set
     */
    public void setHead(String head) {
        this.head = head;
    }

    /**
     * @return the modifierId, or -1 if unknown
     */
    public int getModifierId() {
        return modifierId;
    }

    /**
     * @param modifierId the modifierId to set
     */
    public void setModifierId(int modifierId) {
        this.modifierId = modifierId;
    }

    /**
     * @return the headId, or -1 if unknown
     */
    public int getHeadId() {
        return this.headId;
    }

    /**
     * @param headId the headId to set
     */
    public void setHeadId(int headId) {
        this.headId = headId;
    }

    /**
     * @return whether the modifier was found in GermaNet
     */
    public boolean isModifierInGermaNet() {
        return modifierIsInGermaNet;
    }

    /**
     * @param modifierIsInGermaNet whether the modifier was found in GermaNet
     */
    public void setModifierIsInGermaNet(boolean modifierIsInGermaNet) {
        this.modifierIsInGermaNet = modifierIsInGermaNet;
    }

    /**
     * @return whether the head was found in GermaNet
     */
    public boolean isHeadInGermaNet() {
        return headIsInGermaNet;
    }

    /**
     * @param headIsInGermaNet whether the head was found in GermaNet
     */
    public void setHeadIsInGermaNet(boolean headIsInGermaNet) {
        this.headIsInGermaNet = headIsInGermaNet;
    }

    /**
     * @return the hypernym distance of the modifier, or -1 if none was determined
     */
    public int getModifierHypernymDistance() {
        return modifierHypernymDistance;
    }

    /**
     * @return the hypernym distance of the head, or -1 if none was determined
     */
    public int getHeadHypernymDistance() {
        return headHypernymDistance;
    }

    /**
     * @return the comment
     */
    public String getComment() {
        return this.comment;
    }

    /**
     * @param comment the comment to set
     */
    public void setComment(String comment) {
        this.comment = comment;
    }

    /**
     * Compares the analysis carried by two entries: both "in GermaNet" flags,
     * modifier and head text, and modifier and head id. The compound itself,
     * the comment and the hypernym distances are not compared.
     *
     * @param compoundDBEntry the entry to compare with; may be {@code null}
     * @return {@code true} if both entries describe the same modifier/head
     *         analysis, {@code false} otherwise or if the argument is null
     */
    public boolean equals(CompoundDBEntry compoundDBEntry) {
        if (compoundDBEntry == null) {
            return false;
        }
        if (compoundDBEntry == this) {
            return true;
        }
        return this.modifierIsInGermaNet == compoundDBEntry.isModifierInGermaNet()
                && this.headIsInGermaNet == compoundDBEntry.isHeadInGermaNet()
                && sameText(this.modifier, compoundDBEntry.getModifier())
                && this.modifierId == compoundDBEntry.getModifierId()
                && sameText(this.head, compoundDBEntry.getHead())
                && this.headId == compoundDBEntry.getHeadId();
    }

    /**
     * Overrides {@link Object#equals(Object)} so that collections and other
     * callers holding an {@code Object} reference use the same comparison as
     * {@link #equals(CompoundDBEntry)}.
     */
    @Override
    public boolean equals(Object other) {
        return (other instanceof CompoundDBEntry) && equals((CompoundDBEntry) other);
    }

    /**
     * Hash code over exactly the fields compared by {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        int result = 17;
        result = 31 * result + (modifierIsInGermaNet ? 1 : 0);
        result = 31 * result + (headIsInGermaNet ? 1 : 0);
        result = 31 * result + (modifier == null ? 0 : modifier.hashCode());
        result = 31 * result + modifierId;
        result = 31 * result + (head == null ? 0 : head.hashCode());
        result = 31 * result + headId;
        return result;
    }

    /** Null-safe string equality. */
    private static boolean sameText(String a, String b) {
        return a == null ? b == null : a.equals(b);
    }

    /**
     * Renders the entry as one tab-separated line: compound, compound id,
     * modifier, two empty fields, head, two empty fields, comment.
     *
     * @return the tab-separated line, without a trailing newline
     */
    public String toSQLString() {
        return compound + "\t" + compoundId + "\t"
                + this.modifier + "\t\t\t"
                + this.head + "\t\t\t"
                + this.comment;
    }

    /**
     * Renders modifier, head and comment as one tab-separated line, writing
     * {@code \N} in place of any empty value.
     *
     * @return the tab-separated line, without a trailing newline
     */
    public String toSmallSQLString() {
        String sqlString = "";
        if (this.modifier.equals("")) {
            sqlString += "\\N\t";
        } else {
            sqlString += this.modifier + "\t";
        }
        if (this.head.equals("")) {
            sqlString += "\\N\t";
        } else {
            sqlString += this.head + "\t";
        }
        if (this.comment.equals("")) {
            sqlString += "\\N";
        } else {
            sqlString += this.comment;
        }
        return sqlString;
    }
}
This diff is collapsed.
package decompounder;
import de.uni_leipzig.asv.toolbox.baseforms.Zerleger2;
import decompounder.CompoundDisambiguator.DisambiguatedHypernym;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Splits nouns with the ASV compound splitter ({@code Zerleger2}) and checks
 * the proposed constituents against GermaNet to obtain a modifier/head
 * analysis.
 *
 * <p>Accepted and rejected analyses are appended to two log files whose names
 * are the fixed base names below plus the current file ending (see
 * {@link #setFileEnding(String)}).
 *
 * @author Verena Henrich
 */
public class CompoundDeterminerASV {
    private final Zerleger2 zerleger;
    private final GermaNet germaNet;
    private String compoundsDeterminedWithASVFile = "compounds_determined_with_asv";
    private String nonCompoundsDeterminedWithASVFile = "non_compounds_determined_with_asv";
    private String fileEnding = "";

    /**
     * Creates the determiner and initialises the splitter with the tree files
     * under {@code src/main/resources/ASV_trees}. The paths are relative, so
     * they are resolved against the working directory of the process.
     *
     * @param germaNet GermaNet instance used for all lookups
     * @throws IOException declared for initialisation failures
     */
    public CompoundDeterminerASV(GermaNet germaNet) throws IOException {
        this.germaNet = germaNet;
        String red = "src/main/resources/ASV_trees/grfExt.tree"; // reduce file for splitting
        String forw = "src/main/resources/ASV_trees/kompVVic.tree"; // forward file
        String back = "src/main/resources/ASV_trees/kompVHic.tree"; // backward file
        zerleger = new Zerleger2(); // splitter
        zerleger.init(forw, back, red);
    }

    /**
     * Determines the modifier and head of a noun.
     *
     * <p>The splitter output is cleaned first: empty parts are dropped, a
     * trailing "schaft" is glued back onto the preceding part, and a trailing
     * "bau" + "werk" is merged into "bauwerk". Depending on the number of
     * remaining parts the noun is then treated as no compound (fewer than 2),
     * as a binary compound (2), as a compound with two competing binary
     * readings (3), or rejected (more than 3).
     *
     * @param id GermaNet id of the noun
     * @param noun orthographic form of the noun
     * @return the chosen analysis, or an entry with empty compound, modifier
     *         and head if no analysis was accepted
     * @throws IOException if writing to a log file fails
     */
    public CompoundDBEntry determineCompoundWithASV(int id, String noun) throws IOException {
        List<String> splitted = zerleger.kZerlegung(noun);

        while (splitted.contains("")) {
            splitted.remove("");
        }

        // "schaft" is a suffix, not a constituent: re-attach it to the part before it.
        if (splitted.size() > 1 && splitted.get(splitted.size() - 1).equalsIgnoreCase("schaft")) {
            splitted.remove(splitted.size() - 1);
            splitted.set(splitted.size() - 1, splitted.get(splitted.size() - 1) + "schaft");
        }

        // Treat a trailing "bau" + "werk" as the single constituent "bauwerk".
        if (splitted.size() > 2
                && splitted.get(splitted.size() - 1).equalsIgnoreCase("werk")
                && splitted.get(splitted.size() - 2).equalsIgnoreCase("bau")) {
            splitted.remove(splitted.size() - 1);
            splitted.set(splitted.size() - 1, "bauwerk");
        }

        if (splitted.size() < 2) { // noun is not a compound
            logNonCompound(noun + ": is no compound\n");
            return emptyEntry(id, "");
        }
        if (splitted.size() == 2) { // noun is a binary compound
            return determineBinaryCompound(id, noun, splitted);
        }
        if (splitted.size() == 3) { // noun consists of 3 constituents
            return determineTernaryCompound(id, noun, splitted);
        }
        logNonCompound(noun + " has more than 3 constituents: " + splitted + "\n");
        return emptyEntry(id, "ASV: has more than 3 constituents: " + splitted);
    }

    /**
     * Handles a split into exactly two parts: the analysis is accepted if at
     * least the head is found in GermaNet.
     */
    private CompoundDBEntry determineBinaryCompound(int id, String noun, List<String> splitted) throws IOException {
        String modifier = splitted.get(0);
        String head = splitted.get(1);

        // A part containing ";" is replaced by the corresponding surface
        // substring of the noun, derived from the length of the other part.
        if (modifier.contains(";")) {
            modifier = noun.substring(0, noun.length() - head.length());
        }
        if (head.contains(";")) {
            // NOTE(review): one character after the modifier is skipped here,
            // whereas the former debug output for this branch printed
            // noun.substring(modifier.length()). Confirm whether the "+ 1" is
            // intended (e.g. to skip a linking element) or an off-by-one.
            head = noun.substring(modifier.length() + 1);
        }

        CompoundDBEntry compound = createCompoundDBEntry(modifier, head, id);
        if (compound.isModifierInGermaNet() && compound.isHeadInGermaNet()) {
            compound.setComment("ASV: 2 constituents, both parts in GN");
            logCompound(compound.toSQLString() + "\n");
            return compound;
        } else if (compound.isHeadInGermaNet()) {
            compound.setComment("ASV: 2 constituents, head in GN");
            logCompound(compound.toSQLString() + "\n");
            return compound;
        }
        logNonCompound(noun + " is not correctly splitted: " + splitted + "\n");
        return emptyEntry(id, "ASV: is not correctly splitted: " + splitted);
    }

    /**
     * Handles a split into exactly three parts by building two binary
     * readings and choosing between them.
     *
     * <p>Reading 1 takes the first part as modifier and the rest of the noun
     * as head; reading 2 takes the last part as head and everything before it
     * as modifier. If both readings qualify equally, the one with the larger
     * hypernym distance wins (head distance first, then modifier distance);
     * on a complete tie both are logged as rejected.
     */
    private CompoundDBEntry determineTernaryCompound(int id, String noun, List<String> splitted) throws IOException {
        // Reading 1: the head starts where the second part begins in the noun.
        // The second part is searched without its last character, starting one
        // position before the end of the modifier.
        String modifier = splitted.get(0);
        String constituent2WithoutLastChar = splitted.get(1).substring(0, splitted.get(1).length() - 1);
        String head = noun.substring(modifier.length());
        int headStart = noun.indexOf(constituent2WithoutLastChar, modifier.length() - 1);
        if (headStart != -1) {
            head = noun.substring(headStart);
        }
        CompoundDBEntry compound1 = createCompoundDBEntry(modifier, head, id);

        // Reading 2: the modifier ends where the second part ends in the noun.
        head = splitted.get(2);
        int secondPartStart = noun.indexOf(splitted.get(1));
        if (secondPartStart != -1) {
            modifier = noun.substring(0, secondPartStart + splitted.get(1).length());
        } else {
            modifier = noun.substring(0, noun.length() - head.length());
        }
        CompoundDBEntry compound2 = createCompoundDBEntry(modifier, head, id);

        boolean bothPartsOf1 = compound1.isModifierInGermaNet() && compound1.isHeadInGermaNet();
        boolean bothPartsOf2 = compound2.isModifierInGermaNet() && compound2.isHeadInGermaNet();

        if (bothPartsOf1 && bothPartsOf2) {
            if (compound1.getHeadHypernymDistance() > compound2.getHeadHypernymDistance()) {
                compound1.setComment("ASV: 3 constituents, modifier and head are in GermaNet (headDistance1 > headDistance2)");
                return compound1;
            } else if (compound1.getHeadHypernymDistance() < compound2.getHeadHypernymDistance()) {
                compound2.setComment("ASV: 3 constituents, modifier and head are in GermaNet (headDistance1 < headDistance2)");
                return compound2;
            } else if (compound1.getModifierHypernymDistance() > compound2.getModifierHypernymDistance()) {
                compound1.setComment("ASV: 3 constituents, modifier and head are in GermaNet (modifierDistance1 > modifierDistance2)");
                return compound1;
            } else if (compound1.getModifierHypernymDistance() < compound2.getModifierHypernymDistance()) {
                compound2.setComment("ASV: 3 constituents, modifier and head are in GermaNet (modifierDistance1 < modifierDistance2)");
                return compound2;
            }
            // Complete tie: count it and log both candidates as rejected.
            CompoundDeterminer.twoBinaryCompoundsAreInGN++;
            logNonCompound(noun + ": two possible compounds are in GN\n");
            logNonCompound(compound1.toSQLString() + "\n");
            logNonCompound(compound2.toSQLString() + "\n");
        } else if (bothPartsOf1) {
            // Not redundant with the branch above: here only reading 1 is complete.
            compound1.setComment("ASV: 3 constituents, both parts in GN");
            logCompound(compound1.toSQLString() + "\n");
            return compound1;
        } else if (bothPartsOf2) {
            compound2.setComment("ASV: 3 constituents, both parts in GN");
            logCompound(compound2.toSQLString() + "\n");
            return compound2;
        } else if (compound1.isHeadInGermaNet() && compound2.isHeadInGermaNet()) {
            // These two returns keep the default comment and write no log line.
            if (compound1.getHeadHypernymDistance() > compound2.getHeadHypernymDistance()) {
                return compound1;
            } else if (compound1.getHeadHypernymDistance() < compound2.getHeadHypernymDistance()) {
                return compound2;
            }
            logNonCompound(noun + ": two possible compounds whose heads are in GN\n");
            logNonCompound(compound1.toSQLString() + "\n");
            logNonCompound(compound2.toSQLString() + "\n");
        } else if (compound1.isHeadInGermaNet()) {
            compound1.setComment("ASV: 3 constituents, head in GN");
            logCompound(compound1.toSQLString() + "\n");
            return compound1;
        } else if (compound2.isHeadInGermaNet()) {
            compound2.setComment("ASV: 3 constituents, head in GN");
            logCompound(compound2.toSQLString() + "\n");
            return compound2;
        } else {
            logNonCompound(noun + " not correctly splitted: " + splitted + "\n");
            return emptyEntry(id, "ASV: not correctly splitted: " + splitted);
        }
        // Reached only after an unresolved tie.
        return emptyEntry(id, "");
    }

    /**
     * Looks up modifier and head in GermaNet and builds the entry.
     *
     * <p>The modifier is tried as given, then with an appended "e", then with
     * a lower-case first letter. The head is tried with an upper-case first
     * letter first and then as given. For every variant the hypernyms of the
     * compound are searched before a plain lookup by form; the first variant
     * that is found replaces the constituent's spelling in the entry.
     *
     * @param modifier proposed modifier
     * @param head proposed head
     * @param compoundId GermaNet id of the compound
     * @return the entry with comment "ASV"; ids and distances are -1 where
     *         they could not be determined
     */
    private CompoundDBEntry createCompoundDBEntry(String modifier, String head, int compoundId) {
        String comment = "ASV";
        LexUnit compound = germaNet.getLexUnitByID(compoundId);

        // ----- modifier -----
        int modifierId = -1;
        int modifierHypernymDistance = -1;
        boolean modifierIsInGermaNet = true;
        DisambiguatedHypernym modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, modifier, germaNet);
        List<LexUnit> lexUnits = new ArrayList<LexUnit>();
        if (modifierHypernym == null) {
            lexUnits = germaNet.getLexUnits(modifier);
            if (lexUnits.isEmpty()) {
                // try adding modifier-e
                String modifierWithE = modifier + "e";
                modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, modifierWithE, germaNet);
                if (modifierHypernym == null) {
                    lexUnits = germaNet.getLexUnits(modifierWithE);
                }
                if (modifierHypernym != null || lexUnits.size() > 0) {
                    modifier = modifierWithE;
                } else {
                    // try first letter of modifier in lower case
                    String lowerCased = withLowerCaseInitial(modifier);
                    modifierHypernym = CompoundDisambiguator.disambiguateHypernym(compound, lowerCased, germaNet);
                    if (modifierHypernym == null) {
                        lexUnits = germaNet.getLexUnits(lowerCased);
                    }
                    if (modifierHypernym != null || lexUnits.size() > 0) {
                        modifier = lowerCased;
                    }
                }
            }
        }
        if (modifierHypernym == null && lexUnits.isEmpty()) {
            modifierIsInGermaNet = false;
        } else if (modifierHypernym != null) {
            modifierId = modifierHypernym.getHypernym().getId();
            modifierHypernymDistance = modifierHypernym.getDistance();
        } else if (lexUnits.size() == 1) {
            // An ambiguous form (several lexical units) keeps the id at -1.
            modifierId = lexUnits.get(0).getId();
        }

        // ----- head -----
        int headId = -1;
        int headHypernymDistance = -1;
        boolean headIsInGermaNet = true;
        // try first character of head in upper case
        String capitalizedHead = withUpperCaseInitial(head);
        DisambiguatedHypernym headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, capitalizedHead, germaNet);
        lexUnits = new ArrayList<LexUnit>();
        if (headHypernym == null) {
            lexUnits = germaNet.getLexUnits(capitalizedHead);
            if (lexUnits.isEmpty()) {
                // try as it was
                headHypernym = CompoundDisambiguator.disambiguateHypernym(compound, head, germaNet);
                if (headHypernym == null) {
                    lexUnits = germaNet.getLexUnits(head);
                }
            } else {
                head = capitalizedHead;
            }
        } else {
            head = capitalizedHead;
        }
        if (headHypernym == null && lexUnits.isEmpty()) {
            headIsInGermaNet = false;
        }
        if (headHypernym != null) {
            headId = headHypernym.getHypernym().getId();
            headHypernymDistance = headHypernym.getDistance();
        } else if (lexUnits.size() == 1) {
            headId = lexUnits.get(0).getId();
        }

        return new CompoundDBEntry(compound.getOrthForm(), modifier, modifierId,
                head, headId, compoundId, comment, modifierIsInGermaNet, headIsInGermaNet,
                modifierHypernymDistance, headHypernymDistance);
    }

    /**
     * Sets the suffix appended to both log file names and truncates the two
     * resulting files by writing an empty string in non-append mode.
     *
     * @param fileEnding suffix for the log file names
     * @throws IOException if a log file cannot be written
     */
    public void setFileEnding(String fileEnding) throws IOException {
        this.fileEnding = fileEnding;
        CompoundDeterminer.writeFile(compoundsDeterminedWithASVFile + fileEnding, "", false);
        CompoundDeterminer.writeFile(nonCompoundsDeterminedWithASVFile + fileEnding, "", false);
    }

    /** Appends text to the log of accepted analyses. */
    private void logCompound(String text) throws IOException {
        CompoundDeterminer.writeFile(compoundsDeterminedWithASVFile + fileEnding, text, true);
    }

    /** Appends text to the log of rejected nouns. */
    private void logNonCompound(String text) throws IOException {
        CompoundDeterminer.writeFile(nonCompoundsDeterminedWithASVFile + fileEnding, text, true);
    }

    /** Builds the "no analysis" result: empty compound, modifier and head. */
    private static CompoundDBEntry emptyEntry(int id, String comment) {
        return new CompoundDBEntry("", "", "", id, comment);
    }

    /** Lower-cases the first character; an empty string is returned unchanged. */
    private static String withLowerCaseInitial(String text) {
        if (text.length() == 0) {
            return text;
        }
        return text.substring(0, 1).toLowerCase() + text.substring(1);
    }

    /** Upper-cases the first character; an empty string is returned unchanged. */
    private static String withUpperCaseInitial(String text) {
        if (text.length() == 0) {
            return text;
        }
        return text.substring(0, 1).toUpperCase() + text.substring(1);
    }
}
This diff is collapsed.
This diff is collapsed.
package decompounder;
import de.tuebingen.uni.sfs.germanet.dbapi.ConRelType;
import de.tuebingen.uni.sfs.germanet.dbapi.GermaNet;
import de.tuebingen.uni.sfs.germanet.dbapi.LexRelType;
import de.tuebingen.uni.sfs.germanet.dbapi.LexUnit;
import de.tuebingen.uni.sfs.germanet.dbapi.Synset;
import java.util.ArrayList;
import java.util.List;
/**
 * Static helpers that pick, among the lexical units related to a compound in
 * GermaNet, the one whose orthographic form matches a candidate constituent.
 *
 * @author Verena Henrich
 */
public class CompoundDisambiguator {

    /**
     * A related lexical unit is only accepted as a constituent if its form is
     * more than this many characters shorter than the compound.
     */
    private static final int MIN_LENGTH_DIFFERENCE = 3;

    /**
     * Return hypernym of compound, i.e., correct reading of potentialHypernym.
     * Return null if potentialHypernym is no hypernym of compound.
     *
     * @param compound the compound whose hypernyms are searched
     * @param potentialHypernym orthographic form to look for
     * @param germaNet GermaNet instance used for the lookup
     * @return the matching hypernym with its distance, or {@code null}
     */
    public static DisambiguatedHypernym disambiguateHypernym(LexUnit compound, String potentialHypernym, GermaNet germaNet) {
        List<String> potentialHypernymList = new ArrayList<String>();
        potentialHypernymList.add(potentialHypernym);
        return disambiguateHypernym(compound, potentialHypernymList, germaNet);
    }

    /**
     * Searches the hypernyms of the compound for a lexical unit whose
     * orthographic form is one of the given candidates.
     *
     * <p>The outer list returned by {@code getAllHyperonyms()} is walked in
     * order, and the 1-based position of an inner list is used as the
     * distance (presumably one inner list per hypernym level; confirm against
     * the GermaNet API). The search stops after the first inner list that
     * contains a match. If several lexical units match within that list, the
     * one with the longest orthographic form wins, and a warning is printed
     * to {@code System.err} when a further match comes from a different
     * synset than the first one.
     *
     * @param compound the compound whose hypernyms are searched
     * @param potentialHypernyms candidate orthographic forms
     * @param germaNet GermaNet instance used to reload the compound by id
     * @return the matching hypernym with its distance, or {@code null} if no
     *         candidate is a hypernym of the compound
     */
    public static DisambiguatedHypernym disambiguateHypernym(LexUnit compound, List<String> potentialHypernyms, GermaNet germaNet) {
        LexUnit hypernym = null;
        int distance = -1;
        int level = 1;

        List<List<Synset>> allHypernyms = germaNet.getLexUnitByID(compound.getId()).getSynset().getAllHyperonyms();
        for (List<Synset> synsetsAtLevel : allHypernyms) {
            if (hypernym != null) {
                break;
            }
            for (Synset synset : synsetsAtLevel) {
                // True only while scanning the synset that produced the first match.
                boolean matchInSameSynset = false;
                for (LexUnit lexUnit : synset.getLexUnits()) {
                    if (!potentialHypernyms.contains(lexUnit.getOrthForm())) {
                        continue;
                    }
                    // Remember the current choice so the warning below can
                    // name both competitors even after a replacement.
                    LexUnit previous = hypernym;
                    if (previous == null) {
                        hypernym = lexUnit;
                        distance = level;
                        matchInSameSynset = true;
                    } else if (previous.getOrthForm().length() < lexUnit.getOrthForm().length()) {
                        hypernym = lexUnit;
                        distance = level;
                    }
                    if (!matchInSameSynset && previous != null) {
                        System.err.println(compound.getOrthForm() + " " + compound.getId() + ": two hypernyms possible ("
                                + previous.getOrthForm() + " and " + lexUnit.getOrthForm() + ")");
                    }
                }
            }
            level++;
        }

        if (hypernym == null) {
            return null;
        }
        return new DisambiguatedHypernym(hypernym, distance);
    }

    /**
     * Result of a hypernym search: the matching lexical unit and the distance
     * at which it was found.
     */
    public static class DisambiguatedHypernym {
        private final LexUnit hypernym;
        private final int distance;

        /**
         * @param head the matching hypernym
         * @param distance the distance at which it was found
         */
        public DisambiguatedHypernym(LexUnit head, int distance) {
            this.hypernym = head;
            this.distance = distance;
        }

        /**
         * @return the matching hypernym
         */
        public LexUnit getHypernym() {
            return hypernym;
        }

        /**
         * @return the distance at which the hypernym was found
         */
        public int getDistance() {
            return distance;
        }
    }

    /**
     * Searches the synsets connected to the compound's synset by a part-whole
     * relation (component, member, portion or substance; holonym or meronym)
     * for a lexical unit that matches a candidate form.
     *
     * @param compound the compound whose relations are searched
     * @param potentialPWRelatedLexUnits candidate orthographic forms
     * @param germaNet not used by this method
     * @return the first matching lexical unit whose form is more than three
     *         characters shorter than the compound, or {@code null}
     */
    public static LexUnit disambiguatePWRelation(LexUnit compound, List<String> potentialPWRelatedLexUnits, GermaNet germaNet) {
        // Collect into a list owned by this method instead of appending to a
        // list handed out by the API.
        List<Synset> synsetsInPWR = new ArrayList<Synset>();
        synsetsInPWR.addAll(compound.getSynset().getRelatedSynsets(ConRelType.HAS_COMPONENT_HOLONYM));
        synsetsInPWR.addAll(compound.getSynset().getRelatedSynsets(ConRelType.HAS_COMPONENT_MERONYM));
        synsetsInPWR.addAll(compound.getSynset().getRelatedSynsets(ConRelType.HAS_MEMBER_HOLONYM));
        synsetsInPWR.addAll(compound.getSynset().getRelatedSynsets(ConRelType.HAS_MEMBER_MERONYM));
        synsetsInPWR.addAll(compound.getSynset().getRelatedSynsets(ConRelType.HAS_PORTION_HOLONYM));
        synsetsInPWR.addAll(compound.getSynset().getRelatedSynsets(ConRelType.HAS_PORTION_MERONYM));
        synsetsInPWR.addAll(compound.getSynset().getRelatedSynsets(ConRelType.HAS_SUBSTANCE_HOLONYM));
        synsetsInPWR.addAll(compound.getSynset().getRelatedSynsets(ConRelType.HAS_SUBSTANCE_MERONYM));

        for (Synset synsetInPWR : synsetsInPWR) {
            for (LexUnit lexUnit : synsetInPWR.getLexUnits()) {
                if (isListedAndShorter(lexUnit, compound, potentialPWRelatedLexUnits)) {
                    return lexUnit;
                }
            }
        }
        return null;
    }

    /**
     * considers all relations except part-whole relations and hypernyms
     *
     * <p>Synset-level relations (is entailed by, is related to, causes,
     * entails, has hyponym) are searched first, then the lexical relations
     * antonym and pertainym of the compound itself.
     *
     * @param compound the compound whose relations are searched
     * @param potentialRelatedLexUnits candidate orthographic forms
     * @param germaNet not used by this method
     * @return the first matching lexical unit whose form is more than three
     *         characters shorter than the compound, or {@code null}
     */
    public static LexUnit disambiguateRelation(LexUnit compound, List<String> potentialRelatedLexUnits, GermaNet germaNet) {
        // Collect into lists owned by this method instead of appending to
        // lists handed out by the API.
        List<Synset> relatedSynsets = new ArrayList<Synset>();
        relatedSynsets.addAll(compound.getSynset().getRelatedSynsets(ConRelType.IS_ENTAILED_BY));
        relatedSynsets.addAll(compound.getSynset().getRelatedSynsets(ConRelType.IS_RELATED_TO));
        relatedSynsets.addAll(compound.getSynset().getRelatedSynsets(ConRelType.CAUSES));
        relatedSynsets.addAll(compound.getSynset().getRelatedSynsets(ConRelType.ENTAILS));
        relatedSynsets.addAll(compound.getSynset().getRelatedSynsets(GermaNet.HAS_HYPONYM));

        for (Synset relatedSynset : relatedSynsets) {
            for (LexUnit lexUnit : relatedSynset.getLexUnits()) {
                if (isListedAndShorter(lexUnit, compound, potentialRelatedLexUnits)) {
                    return lexUnit;
                }
            }
        }

        List<LexUnit> relatedLexUnits = new ArrayList<LexUnit>();
        relatedLexUnits.addAll(compound.getRelatedLexUnits(LexRelType.HAS_ANTONYM));
        relatedLexUnits.addAll(compound.getRelatedLexUnits(LexRelType.HAS_PERTAINYM));

        for (LexUnit lexUnit : relatedLexUnits) {
            if (isListedAndShorter(lexUnit, compound, potentialRelatedLexUnits)) {
                return lexUnit;
            }
        }
        return null;
    }

    /**
     * Searches the synonyms of the compound for a lexical unit that matches a
     * candidate form.
     *
     * @param compound the compound whose synonyms are searched
     * @param potentialSynonyms candidate orthographic forms
     * @param germaNet not used by this method
     * @return the first matching synonym whose form is more than three
     *         characters shorter than the compound, or {@code null}
     */
    public static LexUnit disambiguateSynonym(LexUnit compound, List<String> potentialSynonyms, GermaNet germaNet) {
        for (LexUnit lexUnit : compound.getSynonyms()) {
            if (isListedAndShorter(lexUnit, compound, potentialSynonyms)) {
                return lexUnit;
            }
        }
        return null;
    }

    /**
     * Tells whether the candidate's form is one of the allowed forms and is
     * more than {@link #MIN_LENGTH_DIFFERENCE} characters shorter than the
     * compound's form.
     */
    private static boolean isListedAndShorter(LexUnit candidate, LexUnit compound, List<String> allowedForms) {
        return allowedForms.contains(candidate.getOrthForm())
                && candidate.getOrthForm().length() < compound.getOrthForm().length() - MIN_LENGTH_DIFFERENCE;
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment