Commit 38cdccff authored by Wei Qiu
Browse files

Change how models are loaded

parent dc00d935
......@@ -16,4 +16,5 @@ hs_err_pid*
*.swp
target/
.idea/
.settings/
......@@ -10,4 +10,7 @@ server:
logging:
appenders:
- type: ${LOG_APPENDER!'console'}
\ No newline at end of file
- type: ${LOG_APPENDER!'console'}
# possible langs: bg cs de en et fa fr hu ko pl ro sk sl sr sv
langs: ["de", "en", "fr"]
......@@ -27,7 +27,7 @@ public class MarmotApplication extends Application<MarmotConfiguration> {
@Override
public void run(final MarmotConfiguration configuration,
final Environment environment) {
MarmotResource marmotResource = new MarmotResource();
MarmotResource marmotResource = new MarmotResource(configuration.getLangs());
IndexResource indexResource = new IndexResource();
environment.jersey().register(marmotResource);
environment.jersey().register(indexResource);
......
......@@ -6,5 +6,16 @@ import org.hibernate.validator.constraints.*;
import javax.validation.constraints.*;
/**
 * Service configuration holding the list of languages the service should support.
 *
 * <p>The {@code langs} property is bound through {@link #setLangs(String[])} and handed to
 * the resource layer via {@link #getLangs()}. Both accessors copy the array so that callers
 * cannot modify the configuration's internal state after it has been validated.
 */
public class MarmotConfiguration extends Configuration {
    // TODO: implement service configuration

    /** Language codes to support; must be present in the configuration. */
    @NotNull
    private String[] langs;

    /**
     * Returns the configured language codes.
     *
     * @return a copy of the configured language codes, or {@code null} if none were set
     */
    public String[] getLangs() {
        // Defensive copy: arrays are mutable, so never hand out the internal instance.
        return langs == null ? null : langs.clone();
    }

    /**
     * Sets the language codes to support.
     *
     * @param langs language codes; copied so later changes to the argument have no effect
     */
    @JsonProperty
    public void setLangs(String[] langs) {
        // Null is stored as-is so that the @NotNull constraint can report it.
        this.langs = langs == null ? null : langs.clone();
    }
}
......@@ -8,12 +8,12 @@ import lemming.lemma.Lemmatizer;
import marmot.morph.*;
import marmot.morph.Sentence;
import marmot.util.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.io.UnsupportedEncodingException;
import java.util.*;
/**
* @author sunny ha
......@@ -21,10 +21,28 @@ import java.util.List;
*/
public class MarmotTool implements TextCorpusProcessor {
// Logger for this class.
private static final Logger LOGGER = LoggerFactory.getLogger(MarmotTool.class);
// Layers reported by getRequiredLayers(): only the sentences layer.
private static final EnumSet<TextCorpusLayerTag> requiredLayers =
EnumSet.of(TextCorpusLayerTag.SENTENCES);
// Language codes passed to the constructor; corpora in any other language are rejected as unsupported.
private Set<String> supportedLangs = new HashSet<>();
// Default options; consulted for the lemmatizer file and the morphological dictionary path.
MorphOptions options = new MorphOptions();
// Language code -> tagger loaded by the constructor from the "models/<lang>.marmot" resource.
HashMap<String, MorphTagger> taggerMap = new HashMap<>();
/**
 * Loads one tagger model per requested language.
 *
 * <p>Each model is read from the classpath resource {@code models/<lang>.marmot}. The loaded
 * tagger is stored in {@code taggerMap} and the language is recorded in {@code supportedLangs}.
 *
 * @param langs language codes whose models should be loaded
 * @throws IllegalArgumentException if no model resource exists for one of the languages
 */
public MarmotTool(String[] langs) {
    // The class loader does not change between iterations, so look it up once.
    ClassLoader classLoader = getClass().getClassLoader();
    for (String lang : langs) {
        LOGGER.info("Loading model for {}", lang);
        String modelPath = "models/" + lang + ".marmot";
        InputStream modelStream = classLoader.getResourceAsStream(modelPath);
        if (modelStream == null) {
            // getResourceAsStream signals a missing resource with null; fail with a clear
            // message instead of passing null on to the model loader.
            throw new IllegalArgumentException(
                    "No model found for language '" + lang + "' (expected classpath resource "
                            + modelPath + ")");
        }
        // NOTE(review): presumably loadFromStream closes the stream itself - confirm.
        MorphTagger tagger = FileUtils.loadFromStream(modelStream);
        taggerMap.put(lang, tagger);
        // Mark the language as supported only once its model is actually available.
        supportedLangs.add(lang);
    }
}
@Override
public EnumSet<TextCorpusLayerTag> getRequiredLayers() {
return requiredLayers;
......@@ -42,66 +60,57 @@ public class MarmotTool implements TextCorpusProcessor {
//a model is chosen per corpus, not only the German one
//get the language of the text corpus to select the matching model
String lang = textCorpus.getLanguage();
if (!supportedLangs.contains(lang)) {
throw new TextCorpusProcessorException("Unsupported language");
}
try {
//load the model of the language accordingly
ClassLoader classLoader = getClass().getClassLoader();
InputStream modelStream = classLoader.getResourceAsStream("models/" + lang + ".marmot");
MorphTagger tagger = FileUtils.loadFromStream(modelStream);
String lemmatizer_file = options.getLemmatizerFile();
if (!lemmatizer_file.isEmpty()) {
Lemmatizer lemmatizer = FileUtils.loadFromFile(lemmatizer_file);
tagger.setPipeLineLemmatizer(lemmatizer);
}
if (!options.getMorphDict().isEmpty()) {
MorphWeightVector vector = (MorphWeightVector) tagger.getWeightVector();
MorphDictionary dict = vector.getMorphDict();
if (dict != null) {
dict.addWordsFromFile(options.getMorphDict());
} else {
System.err.format("Warning: Can't add words from morph. dictionary, because morph. dictionary is null!\n");
}
}
//create pos and morph layers to tag the data to
PosTagsLayer posTagsLayer = textCorpus.createPosTagsLayer(lang);
MorphologyLayer morphLayer = textCorpus.createMorphologyLayer();
List<Word> sentenceList;
//take the sentence layer and for each sentence, get the tokens list from it
//add the tokens to the sentence list and cast it to marmot sentence class then tag it
// add the pos and morph tag for each token within the sentence
//most, if not all, tokens have more than one morphological features so create a list of feat to be added to the layer
for (int i =0; i < sentencesLayer.size(); i++){
eu.clarin.weblicht.wlfxb.tc.api.Sentence sent = sentencesLayer.getSentence(i);
Token[] tokens = sentencesLayer.getTokens(sent);
sentenceList = new ArrayList<>();
for (Token t : tokens){
sentenceList.add(new Word(t.getString()));
}
Sentence sentence = new Sentence(sentenceList);
lemma_tags= tagger.tagWithLemma(sentence);
List<Feature> feats;
String[] morph;
for (int j =0; j<tokens.length; j++){
posTagsLayer.addTag(lemma_tags.get(j).get(1), tokens[j]);
morph = lemma_tags.get(j).get(2).split("\\|");
feats = new ArrayList<>();
for (String str : morph){
//what shall I name the values?
feats.add(morphLayer.createFeature(str, str));
}
morphLayer.addAnalysis(tokens[j], feats);
}
}
//System.out.println(lemma_tags);
//for (int i= 0; i < lemma_tags.size(); i++) {
//System.out.println(String.format("token: %s : posTag %s", tokensLayer.getToken(i), lemma_tags.get(i)));
//}
} finally {
}
String lemmatizer_file = options.getLemmatizerFile();
if (!lemmatizer_file.isEmpty()) {
Lemmatizer lemmatizer = FileUtils.loadFromFile(lemmatizer_file);
taggerMap.get(lang).setPipeLineLemmatizer(lemmatizer);
}
if (!options.getMorphDict().isEmpty()) {
MorphWeightVector vector = (MorphWeightVector) taggerMap.get(lang).getWeightVector();
MorphDictionary dict = vector.getMorphDict();
if (dict != null) {
dict.addWordsFromFile(options.getMorphDict());
} else {
System.err.format("Warning: Can't add words from morph. dictionary, because morph. dictionary is null!\n");
}
}
//create pos and morph layers to tag the data to
PosTagsLayer posTagsLayer = textCorpus.createPosTagsLayer(lang);
MorphologyLayer morphLayer = textCorpus.createMorphologyLayer();
List<Word> sentenceList;
//take the sentence layer and for each sentence, get the tokens list from it
//add the tokens to the sentence list and cast it to marmot sentence class then tag it
// add the pos and morph tag for each token within the sentence
//most, if not all, tokens have more than one morphological features so create a list of feat to be added to the layer
for (int i =0; i < sentencesLayer.size(); i++){
eu.clarin.weblicht.wlfxb.tc.api.Sentence sent = sentencesLayer.getSentence(i);
Token[] tokens = sentencesLayer.getTokens(sent);
sentenceList = new ArrayList<>();
for (Token t : tokens){
sentenceList.add(new Word(t.getString()));
}
Sentence sentence = new Sentence(sentenceList);
lemma_tags= taggerMap.get(lang).tagWithLemma(sentence);
List<Feature> feats;
String[] morph;
for (int j =0; j<tokens.length; j++){
posTagsLayer.addTag(lemma_tags.get(j).get(1), tokens[j]);
morph = lemma_tags.get(j).get(2).split("\\|");
feats = new ArrayList<>();
for (String str : morph){
//what shall I name the values?
feats.add(morphLayer.createFeature(str, str));
}
morphLayer.addAnalysis(tokens[j], feats);
}
}
}
}
......
......@@ -5,6 +5,8 @@ import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessor;
import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessorException;
import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed;
import eu.clarin.weblicht.wlfxb.io.WLFormatException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.ws.rs.*;
import javax.ws.rs.core.MediaType;
......@@ -12,21 +14,20 @@ import javax.ws.rs.core.Response;
import javax.ws.rs.core.StreamingOutput;
import java.io.*;
import java.util.logging.Level;
import java.util.logging.Logger;
@Path("annotate")
public class MarmotResource {
private static final Logger LOGGER = LoggerFactory.getLogger(MarmotResource.class);
private static final String TEXT_TCF_XML = "text/tcf+xml";
private static final String FALL_BACK_MESSAGE = "Data processing failed";
private static final String TEMP_FILE_PREFIX = "references-output-temp";
private static final String TEMP_FILE_SUFFIX = ".xml";
private TextCorpusProcessor marmotTool;
public MarmotResource() {
marmotTool = new MarmotTool();
/**
 * Creates the resource and the {@link MarmotTool} it delegates to.
 *
 * @param langs language codes passed on to the {@link MarmotTool} constructor
 */
public MarmotResource(String[] langs) {
    this.marmotTool = new MarmotTool(langs);
}
@Path("marmot/bytes")
......@@ -118,7 +119,7 @@ public class MarmotResource {
if (message == null) {
message = FALL_BACK_MESSAGE;
}
Logger.getLogger(this.getClass().getName()).log(Level.SEVERE, message, ex);
LOGGER.error("Failed {}", message, ex);
return Response.status(status).entity(message).type(MediaType.TEXT_PLAIN).build();
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment