Change how models are loaded

38cdccff · Wei Qiu · dc00d935 · 38cdccff · 38cdccff · 38cdccff
Commit 38cdccff authored Oct 26, 2017 by Wei Qiu
6 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -16,4 +16,5 @@ hs_err_pid*
 *.swp
 target/
 .idea/
+.settings/

--- a/src/assembly/conf/service.yml
+++ b/src/assembly/conf/service.yml
@@ -10,4 +10,7 @@ server:

 logging:
  appenders:
-    - type: ${LOG_APPENDER!'console'}
\ No newline at end of file
+    - type: ${LOG_APPENDER!'console'}
+
+# Languages whose models are loaded at startup. Possible values: bg cs de en et fa fr hu ko pl ro sk sl sr sv
+langs: ["de", "en", "fr"]
--- a/src/main/java/de/tuebingen/uni/sfs/clarind/marmot/MarmotApplication.java
+++ b/src/main/java/de/tuebingen/uni/sfs/clarind/marmot/MarmotApplication.java
@@ -27,7 +27,7 @@ public class MarmotApplication extends Application<MarmotConfiguration> {
    @Override
    public void run(final MarmotConfiguration configuration,
                    final Environment environment) {
-        MarmotResource marmotResource = new MarmotResource();
+        MarmotResource marmotResource = new MarmotResource(configuration.getLangs());
        IndexResource indexResource = new IndexResource();
        environment.jersey().register(marmotResource);
        environment.jersey().register(indexResource);

--- a/src/main/java/de/tuebingen/uni/sfs/clarind/marmot/MarmotConfiguration.java
+++ b/src/main/java/de/tuebingen/uni/sfs/clarind/marmot/MarmotConfiguration.java
@@ -6,5 +6,16 @@ import org.hibernate.validator.constraints.*;
 import javax.validation.constraints.*;

 public class MarmotConfiguration extends Configuration {
-    // TODO: implement service configuration
+    /** Languages the service loads models for, bound from the {@code langs} configuration key; must not be null. */
+    @NotNull
+    private String[] langs;
+    /** Returns the configured language array itself (no defensive copy is made). */
+    public String[] getLangs() {
+        return langs;
+    }
+    /** Stores the given array as the configured languages; annotated as a JSON property setter. */
+    @JsonProperty
+    public void setLangs(String[] langs) {
+        this.langs = langs;
+    }
 }
--- a/src/main/java/de/tuebingen/uni/sfs/clarind/marmot/core/MarmotTool.java
+++ b/src/main/java/de/tuebingen/uni/sfs/clarind/marmot/core/MarmotTool.java
@@ -8,12 +8,12 @@ import lemming.lemma.Lemmatizer;
 import marmot.morph.*;
 import marmot.morph.Sentence;
 import marmot.util.FileUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

-import java.io.File;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.EnumSet;
-import java.util.List;
+import java.io.UnsupportedEncodingException;
+import java.util.*;

 /**
 * @author sunny ha
@@ -21,10 +21,28 @@ import java.util.List;
 */

 public class MarmotTool implements TextCorpusProcessor {
+	private static final Logger LOGGER = LoggerFactory.getLogger(MarmotTool.class);
+
    private static final EnumSet<TextCorpusLayerTag> requiredLayers =
            EnumSet.of(TextCorpusLayerTag.SENTENCES);
+    // Languages added by the constructor; tagging a corpus in any other language throws "Unsupported language".
+    private Set<String> supportedLangs = new HashSet<>();
+    // Options consulted while tagging for the lemmatizer file and the morph dictionary.
     MorphOptions options = new MorphOptions();
-   
+    HashMap<String, MorphTagger> taggerMap = new HashMap<>(); // language -> tagger, filled by the constructor
+
+	// Load each configured language's model up front, failing with the resource name when a model is not on the classpath.
+	public MarmotTool(String[] langs) {
+		for (String lang : langs) {
+			LOGGER.info("Loading model for {}", lang);
+			supportedLangs.add(lang);
+			String resource = "models/" + lang + ".marmot";
+			InputStream modelStream = Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream(resource), "Model resource not found on classpath: " + resource);
+			MorphTagger tagger = FileUtils.loadFromStream(modelStream);
+			taggerMap.put(lang, tagger);
+		}
+	}
+
    @Override
    public EnumSet<TextCorpusLayerTag> getRequiredLayers() {
        return requiredLayers;
@@ -42,66 +60,57 @@ public class MarmotTool implements TextCorpusProcessor {
    	//for current purpose we use only german model
 		//get model language of the token
 		String lang = textCorpus.getLanguage();
+        if (!supportedLangs.contains(lang)) {
+            throw new TextCorpusProcessorException("Unsupported language");
+        }
 		

-    	try {
-    		//load the model of the language accordingly
-    		ClassLoader classLoader = getClass().getClassLoader();
-			InputStream modelStream = classLoader.getResourceAsStream("models/" + lang + ".marmot");
-    		MorphTagger tagger = FileUtils.loadFromStream(modelStream);
-    		String lemmatizer_file = options.getLemmatizerFile();
-    		if (!lemmatizer_file.isEmpty()) {
-    			Lemmatizer lemmatizer = FileUtils.loadFromFile(lemmatizer_file);
-    			tagger.setPipeLineLemmatizer(lemmatizer);
-    		}
-    		if (!options.getMorphDict().isEmpty()) {
-    			MorphWeightVector vector = (MorphWeightVector) tagger.getWeightVector();
-    			MorphDictionary dict = vector.getMorphDict();
-    			if (dict != null) {
-    				dict.addWordsFromFile(options.getMorphDict());
-    			} else {
-    				System.err.format("Warning: Can't add words from morph. dictionary, because morph. dictionary is null!\n");
-    			}
-    		}
-    		//create pos and morph layers to tag the data to
-    		PosTagsLayer posTagsLayer = textCorpus.createPosTagsLayer(lang);
-        	MorphologyLayer morphLayer = textCorpus.createMorphologyLayer();
-        	
-    		List<Word> sentenceList;
-			//take the sentence layer and for each sentence, get the tokens list from it
-    		//add the tokens to the sentence list and cast it to marmot sentence class then tag it
-    		// add the pos and morph tag for each token within the sentence
-    		//most, if not all, tokens have more than one morphological features so create a list of feat to be added to the layer
-    		for (int i =0; i < sentencesLayer.size(); i++){
-				eu.clarin.weblicht.wlfxb.tc.api.Sentence sent = sentencesLayer.getSentence(i);
-				Token[] tokens = sentencesLayer.getTokens(sent);
-				sentenceList = new ArrayList<>();
-				for (Token t : tokens){
-					sentenceList.add(new Word(t.getString()));
-				}
-				Sentence sentence = new Sentence(sentenceList);
-				lemma_tags= tagger.tagWithLemma(sentence);
-				List<Feature> feats;
-				String[] morph;
-				for (int j =0; j<tokens.length; j++){
-					posTagsLayer.addTag(lemma_tags.get(j).get(1), tokens[j]);
-					morph = lemma_tags.get(j).get(2).split("\\|");
-					feats = new ArrayList<>();
-					for (String str : morph){
-						//what shall I name the values?
-						feats.add(morphLayer.createFeature(str, str));	
-					}
-					morphLayer.addAnalysis(tokens[j], feats);
-				}
-			}
-			
-			//System.out.println(lemma_tags); 
-			//for (int i= 0; i < lemma_tags.size(); i++) {
-				//System.out.println(String.format("token: %s : posTag %s", tokensLayer.getToken(i), lemma_tags.get(i)));
-			//}
-				
-    	} finally {
-    	}
+        String lemmatizer_file = options.getLemmatizerFile();
+        if (!lemmatizer_file.isEmpty()) {
+            Lemmatizer lemmatizer = FileUtils.loadFromFile(lemmatizer_file);
+            taggerMap.get(lang).setPipeLineLemmatizer(lemmatizer);
+        }
+        if (!options.getMorphDict().isEmpty()) {
+            MorphWeightVector vector = (MorphWeightVector) taggerMap.get(lang).getWeightVector();
+            MorphDictionary dict = vector.getMorphDict();
+            if (dict != null) {
+                dict.addWordsFromFile(options.getMorphDict());
+            } else {
+                System.err.format("Warning: Can't add words from morph. dictionary, because morph. dictionary is null!\n");
+            }
+        }
+        //create pos and morph layers to tag the data to
+        PosTagsLayer posTagsLayer = textCorpus.createPosTagsLayer(lang);
+        MorphologyLayer morphLayer = textCorpus.createMorphologyLayer();
+
+        List<Word> sentenceList;
+        //take the sentence layer and for each sentence, get the tokens list from it
+        //add the tokens to the sentence list and cast it to marmot sentence class then tag it
+        // add the pos and morph tag for each token within the sentence
+        //most, if not all, tokens have more than one morphological features so create a list of feat to be added to the layer
+        for (int i =0; i < sentencesLayer.size(); i++){
+            eu.clarin.weblicht.wlfxb.tc.api.Sentence sent = sentencesLayer.getSentence(i);
+            Token[] tokens = sentencesLayer.getTokens(sent);
+            sentenceList = new ArrayList<>();
+            for (Token t : tokens){
+                sentenceList.add(new Word(t.getString()));
+            }
+            Sentence sentence = new Sentence(sentenceList);
+            lemma_tags= taggerMap.get(lang).tagWithLemma(sentence);
+            List<Feature> feats;
+            String[] morph;
+            for (int j =0; j<tokens.length; j++){
+                posTagsLayer.addTag(lemma_tags.get(j).get(1), tokens[j]);
+                morph = lemma_tags.get(j).get(2).split("\\|");
+                feats = new ArrayList<>();
+                for (String str : morph){
+                    //what shall I name the values?
+                    feats.add(morphLayer.createFeature(str, str));
+                }
+                morphLayer.addAnalysis(tokens[j], feats);
+            }
+        }
+
    }
 }
    

--- a/src/main/java/de/tuebingen/uni/sfs/clarind/marmot/resources/MarmotResource.java
+++ b/src/main/java/de/tuebingen/uni/sfs/clarind/marmot/resources/MarmotResource.java
@@ -5,6 +5,8 @@ import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessor;
 import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessorException;
 import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed;
 import eu.clarin.weblicht.wlfxb.io.WLFormatException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import javax.ws.rs.*;
 import javax.ws.rs.core.MediaType;
@@ -12,21 +14,20 @@ import javax.ws.rs.core.Response;
 import javax.ws.rs.core.StreamingOutput;

 import java.io.*;
-import java.util.logging.Level;
-import java.util.logging.Logger;


 @Path("annotate")
 public class MarmotResource {

+    private static final Logger LOGGER = LoggerFactory.getLogger(MarmotResource.class);
    private static final String TEXT_TCF_XML = "text/tcf+xml";
    private static final String FALL_BACK_MESSAGE = "Data processing failed";
    private static final String TEMP_FILE_PREFIX = "references-output-temp";
    private static final String TEMP_FILE_SUFFIX = ".xml";

    private TextCorpusProcessor marmotTool;
-    public MarmotResource()  {
-        marmotTool = new MarmotTool();
+    public MarmotResource(String[] langs)  { // builds the MarmotTool, which loads a model for every entry of langs
+        marmotTool = new MarmotTool(langs);
     }
    
    @Path("marmot/bytes")
@@ -118,7 +119,7 @@ public class MarmotResource {
        if (message == null) {
            message = FALL_BACK_MESSAGE;
        }
-        Logger.getLogger(this.getClass().getName()).log(Level.SEVERE, message, ex);
+        LOGGER.error("Failed {}", message, ex);
        return Response.status(status).entity(message).type(MediaType.TEXT_PLAIN).build();
    }
 }