Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
clarind
wl-webservices
service-marmot
Commits
38cdccff
Commit
38cdccff
authored
Oct 26, 2017
by
Wei Qiu
Browse files
Change how models are loaded
parent
dc00d935
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
96 additions
and
71 deletions
+96
-71
.gitignore
.gitignore
+1
-0
src/assembly/conf/service.yml
src/assembly/conf/service.yml
+4
-1
src/main/java/de/tuebingen/uni/sfs/clarind/marmot/MarmotApplication.java
...e/tuebingen/uni/sfs/clarind/marmot/MarmotApplication.java
+1
-1
src/main/java/de/tuebingen/uni/sfs/clarind/marmot/MarmotConfiguration.java
...tuebingen/uni/sfs/clarind/marmot/MarmotConfiguration.java
+12
-1
src/main/java/de/tuebingen/uni/sfs/clarind/marmot/core/MarmotTool.java
.../de/tuebingen/uni/sfs/clarind/marmot/core/MarmotTool.java
+72
-63
src/main/java/de/tuebingen/uni/sfs/clarind/marmot/resources/MarmotResource.java
...ngen/uni/sfs/clarind/marmot/resources/MarmotResource.java
+6
-5
No files found.
.gitignore
View file @
38cdccff
...
...
@@ -16,4 +16,5 @@ hs_err_pid*
*.swp
target/
.idea/
.settings/
src/assembly/conf/service.yml
View file @
38cdccff
...
...
@@ -10,4 +10,7 @@ server:
logging
:
appenders
:
-
type
:
${LOG_APPENDER!'console'}
\ No newline at end of file
-
type
:
${LOG_APPENDER!'console'}
# possible langs: bg cs de en et fa fr hu ko pl ro sk sl sr sv
langs
:
[
"
de"
,
"
en"
,
"
fr"
]
src/main/java/de/tuebingen/uni/sfs/clarind/marmot/MarmotApplication.java
View file @
38cdccff
...
...
@@ -27,7 +27,7 @@ public class MarmotApplication extends Application<MarmotConfiguration> {
@Override
public
void
run
(
final
MarmotConfiguration
configuration
,
final
Environment
environment
)
{
MarmotResource
marmotResource
=
new
MarmotResource
();
MarmotResource
marmotResource
=
new
MarmotResource
(
configuration
.
getLangs
()
);
IndexResource
indexResource
=
new
IndexResource
();
environment
.
jersey
().
register
(
marmotResource
);
environment
.
jersey
().
register
(
indexResource
);
...
...
src/main/java/de/tuebingen/uni/sfs/clarind/marmot/MarmotConfiguration.java
View file @
38cdccff
...
...
@@ -6,5 +6,16 @@ import org.hibernate.validator.constraints.*;
import
javax.validation.constraints.*
;
/**
 * Service configuration that adds the {@code langs} setting to the base
 * {@link Configuration}.
 *
 * <p>{@code langs} holds the language codes the service should handle. The
 * array is copied on the way in and on the way out so that callers cannot
 * modify the configuration's internal state through a shared reference.
 */
public class MarmotConfiguration extends Configuration {

    /** Configured language codes; validation rejects a missing value. */
    @NotNull
    private String[] langs;

    /**
     * Returns the configured language codes.
     *
     * @return a copy of the configured array, or {@code null} if none was set
     */
    public String[] getLangs() {
        return langs == null ? null : langs.clone();
    }

    /**
     * Sets the language codes.
     *
     * @param langs language codes to store; the array is copied, and
     *              {@code null} is stored as-is so that {@code @NotNull}
     *              validation can report it
     */
    @JsonProperty
    public void setLangs(String[] langs) {
        this.langs = langs == null ? null : langs.clone();
    }
}
src/main/java/de/tuebingen/uni/sfs/clarind/marmot/core/MarmotTool.java
View file @
38cdccff
...
...
@@ -8,12 +8,12 @@ import lemming.lemma.Lemmatizer;
import
marmot.morph.*
;
import
marmot.morph.Sentence
;
import
marmot.util.FileUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
java.io.File
;
import
java.io.InputStream
;
import
java.util.ArrayList
;
import
java.util.EnumSet
;
import
java.util.List
;
import
java.io.UnsupportedEncodingException
;
import
java.util.*
;
/**
* @author sunny ha
...
...
@@ -21,10 +21,28 @@ import java.util.List;
*/
public
class
MarmotTool
implements
TextCorpusProcessor
{
/** Logger for this class. */
private static final Logger LOGGER = LoggerFactory.getLogger(MarmotTool.class);

/** Layers returned by {@code getRequiredLayers()}: only the sentences layer. */
private static final EnumSet<TextCorpusLayerTag> requiredLayers = EnumSet.of(TextCorpusLayerTag.SENTENCES);

/** Language codes registered by the constructor; checked before processing a corpus. */
private final Set<String> supportedLangs = new HashSet<>();

/** Options queried for the lemmatizer file and the morphological dictionary. */
MorphOptions options = new MorphOptions();

/** Taggers loaded by the constructor, keyed by language code. */
final HashMap<String, MorphTagger> taggerMap = new HashMap<>();
/**
 * Creates the tool and loads one tagger model per requested language.
 *
 * <p>Each model is read from the classpath resource
 * {@code models/<lang>.marmot}. A language is registered as supported only
 * after its model has been loaded, so a failed load never leaves a language
 * marked as supported without a tagger.
 *
 * @param langs language codes whose models should be loaded; must not be
 *              {@code null}
 * @throws IllegalArgumentException if no model resource exists for one of
 *                                  the languages
 */
public MarmotTool(String[] langs) {
    // The class loader does not change between iterations, so look it up once.
    ClassLoader classLoader = getClass().getClassLoader();
    for (String lang : langs) {
        LOGGER.info("Loading model for {}", lang);
        String resourceName = "models/" + lang + ".marmot";
        // try-with-resources releases the stream even if loading fails.
        try (InputStream modelStream = classLoader.getResourceAsStream(resourceName)) {
            // getResourceAsStream signals a missing resource with null, not an exception.
            if (modelStream == null) {
                throw new IllegalArgumentException(
                        "No model resource found for language '" + lang + "' (expected " + resourceName + ")");
            }
            MorphTagger tagger = FileUtils.loadFromStream(modelStream);
            taggerMap.put(lang, tagger);
            supportedLangs.add(lang);
        } catch (java.io.IOException e) {
            // Only the implicit close() can raise this; the model is already
            // loaded at that point, so report it without aborting.
            LOGGER.warn("Could not close model stream for {}", lang, e);
        }
    }
}
@Override
public
EnumSet
<
TextCorpusLayerTag
>
getRequiredLayers
()
{
return
requiredLayers
;
...
...
@@ -42,66 +60,57 @@ public class MarmotTool implements TextCorpusProcessor {
//only languages in supportedLangs can be processed
//get the language of the text corpus to select the matching model
String
lang
=
textCorpus
.
getLanguage
();
if
(!
supportedLangs
.
contains
(
lang
))
{
throw
new
TextCorpusProcessorException
(
"Unsupported language"
);
}
try
{
//load the model of the language accordingly
ClassLoader
classLoader
=
getClass
().
getClassLoader
();
InputStream
modelStream
=
classLoader
.
getResourceAsStream
(
"models/"
+
lang
+
".marmot"
);
MorphTagger
tagger
=
FileUtils
.
loadFromStream
(
modelStream
);
String
lemmatizer_file
=
options
.
getLemmatizerFile
();
if
(!
lemmatizer_file
.
isEmpty
())
{
Lemmatizer
lemmatizer
=
FileUtils
.
loadFromFile
(
lemmatizer_file
);
tagger
.
setPipeLineLemmatizer
(
lemmatizer
);
}
if
(!
options
.
getMorphDict
().
isEmpty
())
{
MorphWeightVector
vector
=
(
MorphWeightVector
)
tagger
.
getWeightVector
();
MorphDictionary
dict
=
vector
.
getMorphDict
();
if
(
dict
!=
null
)
{
dict
.
addWordsFromFile
(
options
.
getMorphDict
());
}
else
{
System
.
err
.
format
(
"Warning: Can't add words from morph. dictionary, because morph. dictionary is null!\n"
);
}
}
//create pos and morph layers to tag the data to
PosTagsLayer
posTagsLayer
=
textCorpus
.
createPosTagsLayer
(
lang
);
MorphologyLayer
morphLayer
=
textCorpus
.
createMorphologyLayer
();
List
<
Word
>
sentenceList
;
//take the sentence layer and for each sentence, get the tokens list from it
//add the tokens to the sentence list and cast it to marmot sentence class then tag it
// add the pos and morph tag for each token within the sentence
//most, if not all, tokens have more than one morphological features so create a list of feat to be added to the layer
for
(
int
i
=
0
;
i
<
sentencesLayer
.
size
();
i
++){
eu
.
clarin
.
weblicht
.
wlfxb
.
tc
.
api
.
Sentence
sent
=
sentencesLayer
.
getSentence
(
i
);
Token
[]
tokens
=
sentencesLayer
.
getTokens
(
sent
);
sentenceList
=
new
ArrayList
<>();
for
(
Token
t
:
tokens
){
sentenceList
.
add
(
new
Word
(
t
.
getString
()));
}
Sentence
sentence
=
new
Sentence
(
sentenceList
);
lemma_tags
=
tagger
.
tagWithLemma
(
sentence
);
List
<
Feature
>
feats
;
String
[]
morph
;
for
(
int
j
=
0
;
j
<
tokens
.
length
;
j
++){
posTagsLayer
.
addTag
(
lemma_tags
.
get
(
j
).
get
(
1
),
tokens
[
j
]);
morph
=
lemma_tags
.
get
(
j
).
get
(
2
).
split
(
"\\|"
);
feats
=
new
ArrayList
<>();
for
(
String
str
:
morph
){
//what shall I name the values?
feats
.
add
(
morphLayer
.
createFeature
(
str
,
str
));
}
morphLayer
.
addAnalysis
(
tokens
[
j
],
feats
);
}
}
//System.out.println(lemma_tags);
//for (int i= 0; i < lemma_tags.size(); i++) {
//System.out.println(String.format("token: %s : posTag %s", tokensLayer.getToken(i), lemma_tags.get(i)));
//}
}
finally
{
}
String
lemmatizer_file
=
options
.
getLemmatizerFile
();
if
(!
lemmatizer_file
.
isEmpty
())
{
Lemmatizer
lemmatizer
=
FileUtils
.
loadFromFile
(
lemmatizer_file
);
taggerMap
.
get
(
lang
).
setPipeLineLemmatizer
(
lemmatizer
);
}
if
(!
options
.
getMorphDict
().
isEmpty
())
{
MorphWeightVector
vector
=
(
MorphWeightVector
)
taggerMap
.
get
(
lang
).
getWeightVector
();
MorphDictionary
dict
=
vector
.
getMorphDict
();
if
(
dict
!=
null
)
{
dict
.
addWordsFromFile
(
options
.
getMorphDict
());
}
else
{
System
.
err
.
format
(
"Warning: Can't add words from morph. dictionary, because morph. dictionary is null!\n"
);
}
}
//create pos and morph layers to tag the data to
PosTagsLayer
posTagsLayer
=
textCorpus
.
createPosTagsLayer
(
lang
);
MorphologyLayer
morphLayer
=
textCorpus
.
createMorphologyLayer
();
List
<
Word
>
sentenceList
;
//take the sentence layer and for each sentence, get the tokens list from it
//add the tokens to the sentence list and cast it to marmot sentence class then tag it
// add the pos and morph tag for each token within the sentence
//most, if not all, tokens have more than one morphological features so create a list of feat to be added to the layer
for
(
int
i
=
0
;
i
<
sentencesLayer
.
size
();
i
++){
eu
.
clarin
.
weblicht
.
wlfxb
.
tc
.
api
.
Sentence
sent
=
sentencesLayer
.
getSentence
(
i
);
Token
[]
tokens
=
sentencesLayer
.
getTokens
(
sent
);
sentenceList
=
new
ArrayList
<>();
for
(
Token
t
:
tokens
){
sentenceList
.
add
(
new
Word
(
t
.
getString
()));
}
Sentence
sentence
=
new
Sentence
(
sentenceList
);
lemma_tags
=
taggerMap
.
get
(
lang
).
tagWithLemma
(
sentence
);
List
<
Feature
>
feats
;
String
[]
morph
;
for
(
int
j
=
0
;
j
<
tokens
.
length
;
j
++){
posTagsLayer
.
addTag
(
lemma_tags
.
get
(
j
).
get
(
1
),
tokens
[
j
]);
morph
=
lemma_tags
.
get
(
j
).
get
(
2
).
split
(
"\\|"
);
feats
=
new
ArrayList
<>();
for
(
String
str
:
morph
){
//what shall I name the values?
feats
.
add
(
morphLayer
.
createFeature
(
str
,
str
));
}
morphLayer
.
addAnalysis
(
tokens
[
j
],
feats
);
}
}
}
}
...
...
src/main/java/de/tuebingen/uni/sfs/clarind/marmot/resources/MarmotResource.java
View file @
38cdccff
...
...
@@ -5,6 +5,8 @@ import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessor;
import
eu.clarin.weblicht.wlfxb.api.TextCorpusProcessorException
;
import
eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed
;
import
eu.clarin.weblicht.wlfxb.io.WLFormatException
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
javax.ws.rs.*
;
import
javax.ws.rs.core.MediaType
;
...
...
@@ -12,21 +14,20 @@ import javax.ws.rs.core.Response;
import
javax.ws.rs.core.StreamingOutput
;
import
java.io.*
;
import
java.util.logging.Level
;
import
java.util.logging.Logger
;
@Path
(
"annotate"
)
public
class
MarmotResource
{
private
static
final
Logger
LOGGER
=
LoggerFactory
.
getLogger
(
MarmotResource
.
class
);
private
static
final
String
TEXT_TCF_XML
=
"text/tcf+xml"
;
private
static
final
String
FALL_BACK_MESSAGE
=
"Data processing failed"
;
private
static
final
String
TEMP_FILE_PREFIX
=
"references-output-temp"
;
private
static
final
String
TEMP_FILE_SUFFIX
=
".xml"
;
private
TextCorpusProcessor
marmotTool
;
public
MarmotResource
()
{
marmotTool
=
new
MarmotTool
();
public
MarmotResource
(
String
[]
langs
)
{
marmotTool
=
new
MarmotTool
(
langs
);
}
@Path
(
"marmot/bytes"
)
...
...
@@ -118,7 +119,7 @@ public class MarmotResource {
if
(
message
==
null
)
{
message
=
FALL_BACK_MESSAGE
;
}
L
ogger
.
getLogger
(
this
.
getClass
().
getName
()).
log
(
Level
.
SEVERE
,
message
,
ex
);
L
OGGER
.
error
(
"Failed {}"
,
message
,
ex
);
return
Response
.
status
(
status
).
entity
(
message
).
type
(
MediaType
.
TEXT_PLAIN
).
build
();
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment