Commit 9282250a authored by Sunny Ha's avatar Sunny Ha
Browse files

initial commit - pom.xml not updated yet

    de.tuebingen.uni.sfs.clarind: DEBUG
parents
#Created by .ignore support plugin (hsz.mobi)
### Java template
*.class
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.jar
*.war
*.ear
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
.DS_Store
.classpath
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding//src/test/resources=UTF-8
encoding/<project>=UTF-8
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.8
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
# marmot
How to start the marmot application
---
1. Run `mvn clean install` to build your application
1. Start application with `java -jar target/service-marmot-1.0-SNAPSHOT.jar server config.yml`
1. To check that your application is running enter url `http://localhost:8080`
Health Check
---
To see your application's health, enter the url `http://localhost:8081/healthcheck`
<?xml version="1.0" encoding="UTF-8"?>
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<prerequisites>
<maven>3.0.0</maven>
</prerequisites>
<groupId>de.tuebingen.uni.sfs.clarind</groupId>
<artifactId>service-marmot</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>marmot</name>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<dropwizard.version>1.1.0</dropwizard.version>
<mainClass>de.tuebingen.uni.sfs.clarind.MarmotApplication</mainClass>
</properties>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>io.dropwizard</groupId>
<artifactId>dropwizard-bom</artifactId>
<version>${dropwizard.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
<dependency>
<groupId>io.dropwizard</groupId>
<artifactId>dropwizard-core</artifactId>
</dependency>
<dependency>
<groupId>de.thomaskrille</groupId>
<artifactId>dropwizard-template-config</artifactId>
<version>1.5.0</version>
</dependency>
<dependency>
<groupId>eu.clarin.weblicht</groupId>
<artifactId>wlfxb</artifactId>
<version>1.3.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.1</version>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>${mainClass}</mainClass>
</transformer>
</transformers>
<!-- exclude signed Manifests -->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>2.6</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>${mainClass}</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.3</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>2.8.1</version>
<configuration>
<dependencyLocationsEnabled>false</dependencyLocationsEnabled>
<dependencyDetailsEnabled>false</dependencyDetailsEnabled>
</configuration>
</plugin>
<plugin>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.3</version>
</plugin>
</plugins>
</reporting>
</project>
package de.tuebingen.uni.sfs.clarind;
import io.dropwizard.Application;
import io.dropwizard.setup.Bootstrap;
import io.dropwizard.setup.Environment;
import de.thomaskrille.dropwizard_template_config.TemplateConfigBundle;
import de.tuebingen.uni.sfs.clarind.resources.MarmotResource;
import de.tuebingen.uni.sfs.clarind.resources.IndexResource;
/**
 * Dropwizard entry point for the marmot annotation service.
 */
public class MarmotApplication extends Application<MarmotConfiguration> {

    public static void main(final String[] args) throws Exception {
        new MarmotApplication().run(args);
    }

    @Override
    public String getName() {
        return "service-marmot";
    }

    @Override
    public void initialize(final Bootstrap<MarmotConfiguration> bootstrap) {
        // Enables templating of config.yml (e.g. environment-variable substitution).
        bootstrap.addBundle(new TemplateConfigBundle());
    }

    @Override
    public void run(final MarmotConfiguration configuration, final Environment environment) {
        // Register the JAX-RS resources that make up this service's HTTP API.
        environment.jersey().register(new MarmotResource());
        environment.jersey().register(new IndexResource());
    }
}
package de.tuebingen.uni.sfs.clarind;
import io.dropwizard.Configuration;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.hibernate.validator.constraints.*;
import javax.validation.constraints.*;
/**
 * Dropwizard configuration for the marmot service.
 * Currently empty: the service reads no custom settings from config.yml.
 */
public class MarmotConfiguration extends Configuration {
// TODO: implement service configuration
}
package de.tuebingen.uni.sfs.clarind.core;
import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessor;
import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessorException;
import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed;
import eu.clarin.weblicht.wlfxb.io.WLFormatException;
import eu.clarin.weblicht.wlfxb.tc.api.*;
import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Response;
import lemming.lemma.Lemmatizer;
import marmot.core.Sequence;
import marmot.morph.MorphDictionary;
import marmot.morph.MorphOptions;
import marmot.morph.MorphTagger;
import marmot.morph.MorphWeightVector;
import marmot.morph.Sentence;
import marmot.morph.Word;
import marmot.morph.cmd.Annotator;
import marmot.morph.io.FileOptions;
import marmot.morph.io.SentenceReader;
import marmot.morph.io.SentenceTextReader;
import marmot.util.FileUtils;
import marmot.util.Sys;
/**
* @author sunny ha
* Takes SentencesLayer and tags PoS and Morph layer using Marmot api
*/
/**
 * TCF processor that adds part-of-speech and morphology layers to a
 * {@link TextCorpus} using the MarMoT tagger.
 *
 * <p>Requires an existing sentences layer; a language-specific model is
 * loaded from the classpath resource {@code models/<lang>.marmot}.
 *
 * @author sunny ha
 */
public class MarmotTool implements TextCorpusProcessor {

    /** Layers that must be present in the incoming TCF document. */
    private static final EnumSet<TextCorpusLayerTag> requiredLayers =
            EnumSet.of(TextCorpusLayerTag.SENTENCES);

    /** Tagger options; defaults apply unless a lemmatizer/dictionary is configured. */
    MorphOptions options = new MorphOptions();

    @Override
    public EnumSet<TextCorpusLayerTag> getRequiredLayers() {
        return requiredLayers;
    }

    /**
     * Tags every sentence of the corpus and writes the results into newly
     * created POS and morphology layers.
     *
     * @param textCorpus corpus to annotate; must contain a sentences layer
     * @throws TextCorpusProcessorException if no MarMoT model exists for the
     *         corpus language
     */
    @Override
    public void process(TextCorpus textCorpus) throws TextCorpusProcessorException {
        SentencesLayer sentencesLayer = textCorpus.getSentencesLayer();
        // Model selection is keyed on the corpus language (currently only German models ship).
        String lang = textCorpus.getLanguage();

        // Load the tagging model for this language from the classpath.
        // The original dereferenced getResource() unchecked and threw an
        // opaque NPE when the model was missing.
        ClassLoader classLoader = getClass().getClassLoader();
        java.net.URL modelUrl = classLoader.getResource("models/" + lang + ".marmot");
        if (modelUrl == null) {
            throw new TextCorpusProcessorException(
                    "No MarMoT model found for language '" + lang + "'");
        }
        File file = new File(modelUrl.getFile());
        MorphTagger tagger = FileUtils.loadFromFile(file);

        // Optionally attach a pipeline lemmatizer if one is configured.
        String lemmatizerFile = options.getLemmatizerFile();
        if (!lemmatizerFile.isEmpty()) {
            Lemmatizer lemmatizer = FileUtils.loadFromFile(lemmatizerFile);
            tagger.setPipeLineLemmatizer(lemmatizer);
        }

        // Optionally extend the tagger's morphological dictionary.
        if (!options.getMorphDict().isEmpty()) {
            MorphWeightVector vector = (MorphWeightVector) tagger.getWeightVector();
            MorphDictionary dict = vector.getMorphDict();
            if (dict != null) {
                dict.addWordsFromFile(options.getMorphDict());
            } else {
                System.err.format("Warning: Can't add words from morph. dictionary, because morph. dictionary is null!\n");
            }
        }

        // Create the output layers that will hold the new annotations.
        PosTagsLayer posTagsLayer = textCorpus.createPosTagsLayer(lang);
        MorphologyLayer morphLayer = textCorpus.createMorphologyLayer();

        // Tag sentence by sentence. For each token, the tagger result list
        // yields the POS tag at index 1 and a '|'-separated morphological
        // feature string at index 2 (index 0 is presumably the lemma —
        // TODO confirm against the MarMoT API).
        for (int i = 0; i < sentencesLayer.size(); i++) {
            eu.clarin.weblicht.wlfxb.tc.api.Sentence sent = sentencesLayer.getSentence(i);
            Token[] tokens = sentencesLayer.getTokens(sent);
            List<Word> sentenceList = new ArrayList<>();
            for (Token t : tokens) {
                sentenceList.add(new Word(t.getString()));
            }
            List<List<String>> lemmaTags = tagger.tagWithLemma(new Sentence(sentenceList));
            for (int j = 0; j < tokens.length; j++) {
                posTagsLayer.addTag(lemmaTags.get(j).get(1), tokens[j]);
                // Most tokens carry several features, e.g. "case=nom|number=sg".
                String[] morph = lemmaTags.get(j).get(2).split("\\|");
                List<Feature> feats = new ArrayList<>();
                for (String str : morph) {
                    // The raw feature string is used as both name and value;
                    // the MarMoT output is not split into name/value pairs.
                    feats.add(morphLayer.createFeature(str, str));
                }
                morphLayer.addAnalysis(tokens[j], feats);
            }
        }
    }
}
package de.tuebingen.uni.sfs.clarind.core;
//not used - unnecessary
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import marmot.core.Sequence;
import marmot.morph.Sentence;
import marmot.morph.Word;
import marmot.morph.io.FileOptions;
import marmot.util.Converter;
import marmot.util.LineIterator;
/**
 * Adapts a list of token-field rows into MarMoT {@link Sequence}s (sentences).
 * An empty row acts as a sentence separator; consecutive non-empty rows are
 * folded into one sentence. NOTE(review): the file header marks this class as
 * "not used - unnecessary".
 */
public class WordReader implements Iterable<Sequence> {
// Each inner list is one row of token fields; an empty inner list ends a sentence.
private List<List<String>> tokenList_;
public WordReader(List<List<String>> tokenList) {
tokenList_ = tokenList;
}
@Override
public Iterator<Sequence> iterator() {
// A single pass over the rows is shared by all next() calls.
final Iterator<List<String>> iterator = tokenList_.iterator();
return new Iterator<Sequence>() {
@Override
public boolean hasNext() {
// NOTE(review): reports true even for a trailing run of empty rows,
// in which case next() returns an empty Sentence.
return iterator.hasNext();
}
@Override
public Sequence next() {
List<Word> tokens = new LinkedList<Word>();
// Accumulate words until an empty row is seen after at least one token
// (leading empty rows are skipped), or until the input is exhausted.
while (iterator.hasNext()) {
List<String> fields = iterator.next();
if (fields.isEmpty()) {
if (!tokens.isEmpty()) {
break;
}
} else {
// Every field of the row becomes a separate Word.
for (String field : fields) {
Word word = new Word(field);
tokens.add(word);
}
}
}
return new Sentence(tokens);
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
}
package de.tuebingen.uni.sfs.clarind.resources;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import java.io.InputStream;
/**
* Resource that serves up the index page.
*/
@Path("/")
public class IndexResource {
@GET
@Produces("text/html")
public InputStream index() {
return getClass().getResourceAsStream("/index.html");
}
@GET
@Path("/input.xml")
@Produces("text/xml")
public InputStream inputTestData() {
return getClass().getResourceAsStream("/input.xml");
}
}
package de.tuebingen.uni.sfs.clarind.resources;
import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessor;
import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessorException;
import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed;
import eu.clarin.weblicht.wlfxb.io.WLFormatException;
import javax.ws.rs.*;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.StreamingOutput;
import de.tuebingen.uni.sfs.clarind.core.MarmotTool;
import java.io.*;
import java.util.logging.Level;
import java.util.logging.Logger;
@Path("annotate")
public class MarmotResource {
private static final String TEXT_TCF_XML = "text/tcf+xml";
private static final String FALL_BACK_MESSAGE = "Data processing failed";
private static final String TEMP_FILE_PREFIX = "references-output-temp";
private static final String TEMP_FILE_SUFFIX = ".xml";
private TextCorpusProcessor marmotTool;
public MarmotResource() {
marmotTool = new MarmotTool();
}
@Path("marmot/bytes")
@POST
@Consumes(TEXT_TCF_XML)
@Produces(TEXT_TCF_XML)
public Response processMarmotWithBytesArray(final InputStream input) {
// prepare the storage for TCF output
ByteArrayOutputStream output = new ByteArrayOutputStream();
// process incoming TCF and output resulting TCF with new annotation layer(s) added
process(input, output, marmotTool);
// if no exceptions occur to this point, return OK status and TCF output
// with the added annotation layer(s)
return Response.ok(output.toByteArray()).build();
}
@Path("marmot/stream")
@POST
@Consumes(TEXT_TCF_XML)
@Produces(TEXT_TCF_XML)
public StreamingOutput processMarmotWithStreaming(final InputStream input) {
// prepare temporary file and temprary output stream for writing TCF
OutputStream tempOutputData = null;
File tempOutputFile = null;
try {
tempOutputFile = File.createTempFile(TEMP_FILE_PREFIX, TEMP_FILE_SUFFIX);
tempOutputData = new BufferedOutputStream(new FileOutputStream(tempOutputFile));
} catch (IOException ex) {
if (tempOutputData != null) {
try {
tempOutputData.close();
} catch (IOException e) {
throw new WebApplicationException(createResponse(ex, Response.Status.INTERNAL_SERVER_ERROR));
}
}
if (tempOutputFile != null) {
tempOutputFile.delete();
}
throw new WebApplicationException(createResponse(ex, Response.Status.INTERNAL_SERVER_ERROR));
}
// process incoming TCF and output resulting TCF with new annotation layer(s) added
process(input, tempOutputData, marmotTool);
// if there were no errors reading and writing TCF data, the resulting
// TCF can be sent as StreamingOutput from the TCF output temporary file
return new StreamingTempFileOutput(tempOutputFile);
}
private void process(final InputStream input, OutputStream output, TextCorpusProcessor tool) {
TextCorpusStreamed textCorpus = null;
try {
// create TextCorpus object from the client request input,
// only required annotation layers will be read into the object
textCorpus = new TextCorpusStreamed(input, tool.getRequiredLayers(), output, false);
// process TextCorpus and create new annotation layer(s) with your Tool
tool.process(textCorpus);
} catch (TextCorpusProcessorException ex) {
throw new WebApplicationException(createResponse(ex, Response.Status.INTERNAL_SERVER_ERROR));
} catch (WLFormatException ex) {
throw new WebApplicationException(createResponse(ex, Response.Status.BAD_REQUEST));
} catch (Exception ex) {
throw new WebApplicationException(createResponse(ex, Response.Status.INTERNAL_SERVER_ERROR));
} finally {
try {
if (textCorpus != null) {
// it's important to close the TextCorpusStreamed, otherwise
// the TCF XML output will not be written to the end
textCorpus.close();
}
} catch (Exception ex) {
throw new WebApplicationException(createResponse(ex, Response.Status.INTERNAL_SERVER_ERROR));
}
}
}
/* if exception message is provided, use it as it is;
* if exception message is null, use fall back message
* (needs to be non-empty String in order to prevent
* HTTP container generated html message) */
private Response createResponse(Exception ex, Response.Status status) {
String message = ex.getMessage();
if (message == null) {
message = FALL_BACK_MESSAGE;
}
Logger.getLogger(this.getClass().getName()).log(Level.SEVERE, message, ex);
return Response.status(status).entity(message).type(MediaType.TEXT_PLAIN).build();
}
}
package de.tuebingen.uni.sfs.clarind.resources;
import static org.junit.Assert.*;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
import javax.ws.rs.WebApplicationException;
import org.junit.Test;
/**
 * Smoke test for {@link MarmotResource}: streams a TCF file through the
 * service and writes the annotated result to disk.
 *
 * <p>NOTE(review): depends on a hard-coded input file being present in the
 * working directory and makes no assertions — it only checks that processing
 * completes without throwing.
 */
public class MarmotResourceTest {
    @Test
    public void test() throws WebApplicationException, IOException {
        MarmotResource mr = new MarmotResource();
        // try-with-resources: the original leaked both streams.
        try (InputStream input = new FileInputStream("chainer_result_4000722053992600189.xml");
             OutputStream output = new FileOutputStream("output4.xml")) {
            mr.processMarmotWithStreaming(input).write(output);
        }
    }
}
package de.tuebingen.uni.sfs.clarind.resources;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.StreamingOutput;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * StreamingOutput that copies a temporary file to the response stream and
 * deletes the file afterwards, whether or not the copy succeeded.
 */
public class StreamingTempFileOutput implements StreamingOutput {

    /** Temporary file holding the TCF output; deleted after streaming. */
    private final File file;

    public StreamingTempFileOutput(File file) {
        this.file = file;
    }

    /**
     * Writes the file's bytes to {@code out}, then deletes the file.
     *
     * @throws IOException if the file cannot be read or the output written;
     *         unlike the original, a failure to close the input is no longer
     *         silently logged and swallowed — try-with-resources propagates it
     */
    @Override
    public void write(OutputStream out) throws IOException, WebApplicationException {
        byte[] buffer = new byte[256 * 1024];
        try (FileInputStream input = new FileInputStream(file)) {
            // Plain read loop replaces the original FileChannel/ByteBuffer
            // round-trip, which copied through the same backing array anyway.
            int length;
            while ((length = input.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
        } finally {
            // Always remove the temp file, even if streaming failed.
            file.delete();
        }
    }
}
================================================================================
marmot
================================================================================
<!DOCTYPE html>
<html>
<head>
<title>marmot-service test</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
<h1>A TCF processing web-service which does morphology analysis and pos tagging</h1>
<p>
This web-service does morphology analysis and pos tagging.
The service processes POST requests containing TCF data with tokens.
</p>
<br/>
<p>You can test the morphology analysis service using curl or wget as follows:</p>
<ul>
<li>download the provided <a href="input.xml">input file</a></li>
<li>Run curl:
<p>
<code> curl -H 'content-type: text/tcf+xml' --data-binary @input.xml -X POST <span class="url"></span>annotate/marmot/stream</code>
</p>
Or:
<p>
<code> curl -H 'content-type: text/tcf+xml' --data-binary @input.xml -X POST <span class="url"></span>annotate/marmot/bytes</code>
</p>
</li>
<li>Or wget:
<p>
<code> wget --post-file=input.xml --header='Content-Type: text/tcf+xml' <span class="url"></span>annotate/marmot/stream</code>
</p>
Or:
<p>
<code> wget --post-file=input.xml --header='Content-Type: text/tcf+xml' <span class="url"></span>annotate/marmot/bytes</code>
</p>
</li>
</ul>
<script type="text/javascript">
var elemList = document.getElementsByClassName("url");
for (var i = 0; i < elemList.length; i++) {
elemList[i].innerHTML = window.location.href;
}
</script>
</body>
</html>
<?xml version="1.0" encoding="UTF-8"?><D-Spin xmlns="http://www.dspin.de/data" version="0.4">
<MetaData xmlns="http://www.dspin.de/data/metadata"><Services><cmd:CMD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:cmd="http://www.clarin.eu/cmd/1" CMDVersion="1.2" xsi:schemaLocation="http://www.clarin.eu/cmd/1 http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1320657629623/xsd"><cmd:Resources><cmd:ResourceProxyList></cmd:ResourceProxyList><cmd:JournalFileProxyList></cmd:JournalFileProxyList><cmd:ResourceRelationList></cmd:ResourceRelationList></cmd:Resources><cmd:Components><cmd:WebServiceToolChain><cmd:GeneralInfo><cmd:Descriptions><cmd:Description></cmd:Description></cmd:Descriptions><cmd:ResourceName>Custom chain</cmd:ResourceName><cmd:ResourceClass>Toolchain</cmd:ResourceClass></cmd:GeneralInfo><cmd:Toolchain><cmd:ToolInChain><cmd:PID>http://hdl.handle.net/11858/00-1778-0000-0004-BA56-7</cmd:PID><cmd:Parameter value="en" name="lang"></cmd:Parameter><cmd:Parameter value="text/plain" name="type"></cmd:Parameter></cmd:ToolInChain><cmd:ToolInChain><cmd:PID>http://hdl.handle.net/11022/0000-0000-2518-C</cmd:PID></cmd:ToolInChain></cmd:Toolchain></cmd:WebServiceToolChain></cmd:Components></cmd:CMD></Services></MetaData>
<TextCorpus xmlns="http://www.dspin.de/data/textcorpus" lang="en">
<tc:text xmlns:tc="http://www.dspin.de/data/textcorpus">What has happened in recent years has brought to our attention the tremendous problems facing not only to the producers of food but also the consumers. We must strike a balance between them. We must resolve this issue, because it is important that consumers should regain confidence in the food they eat.
One way to achieve this is to have total transparency as far as food labelling is concerned. GMOs are the new challenge facing us. This is something that people are very worried about and quite rightly so: I myself share these concerns. But I think we should not allow our concerns about GMOs to overshadow our concerns about growth promoters used in animal feed or antibiotics used in compound feed. Indeed, we should not allow GMOs to obscure the fact that meat and bonemeal are still included in animal feeds in many countries in Europe. One factor behind these developments that has been referred to in this debate is competition - competition between Member States on the cost of the production of food. These are all areas in which we have to ensure a level playing field: food must be of the same standard in all Member States.
We have had the dioxin scare, BSE and many other problems. The main problem is a financial one, namely who bears the cost? The problem is that the cost is not shared equally between the consumer and the producer: the producer has been forced to bear all the cost. We need an equitable distribution of the extra cost that has been incurred. We must also ensure that the food that is imported into the European Union meets the standards within the European Union. If we do not maintain those standards for imported food then we are going to encounter greater difficulties in the future.
</tc:text>
<tc:tokens xmlns:tc="http://www.dspin.de/data/textcorpus">
<tc:token ID="t_0">What</tc:token>
<tc:token ID="t_1">has</tc:token>
<tc:token ID="t_2">happened</tc:token>
<tc:token ID="t_3">in</tc:token>
<tc:token ID="t_4">recent</tc:token>
<tc:token ID="t_5">years</tc:token>
<tc:token ID="t_6">has</tc:token>
<tc:token ID="t_7">brought</tc:token>
<tc:token ID="t_8">to</tc:token>
<tc:token ID="t_9">our</tc:token>
<tc:token ID="t_10">attention</tc:token>
<tc:token ID="t_11">the</tc:token>
<tc:token ID="t_12">tremendous</tc:token>
<tc:token ID="t_13">problems</tc:token>
<tc:token ID="t_14">facing</tc:token>
<tc:token ID="t_15">not</tc:token>
<tc:token ID="t_16">only</tc:token>
<tc:token ID="t_17">to</tc:token>
<tc:token ID="t_18">the</tc:token>
<tc:token ID="t_19">producers</tc:token>
<tc:token ID="t_20">of</tc:token>
<tc:token ID="t_21">food</tc:token>
<tc:token ID="t_22">but</tc:token>
<tc:token ID="t_23">also</tc:token>
<tc:token ID="t_24">the</tc:token>
<tc:token ID="t_25">consumers</tc:token>
<tc:token ID="t_26">.</tc:token>
<tc:token ID="t_27">We</tc:token>
<tc:token ID="t_28">must</tc:token>
<tc:token ID="t_29">strike</tc:token>
<tc:token ID="t_30">a</tc:token>
<tc:token ID="t_31">balance</tc:token>
<tc:token ID="t_32">between</tc:token>
<tc:token ID="t_33">them</tc:token>
<tc:token ID="t_34">.</tc:token>
<tc:token ID="t_35">We</tc:token>
<tc:token ID="t_36">must</tc:token>
<tc:token ID="t_37">resolve</tc:token>
<tc:token ID="t_38">this</tc:token>
<tc:token ID="t_39">issue</tc:token>
<tc:token ID="t_40">,</tc:token>
<tc:token ID="t_41">because</tc:token>
<tc:token ID="t_42">it</tc:token>
<tc:token ID="t_43">is</tc:token>
<tc:token ID="t_44">important</tc:token>
<tc:token ID="t_45">that</tc:token>
<tc:token ID="t_46">consumers</tc:token>
<tc:token ID="t_47">should</tc:token>
<tc:token ID="t_48">regain</tc:token>
<tc:token ID="t_49">confidence</tc:token>
<tc:token ID="t_50">in</tc:token>
<tc:token ID="t_51">the</tc:token>
<tc:token ID="t_52">food</tc:token>
<tc:token ID="t_53">they</tc:token>
<tc:token ID="t_54">eat</tc:token>
<tc:token ID="t_55">.</tc:token>
<tc:token ID="t_56">One</tc:token>
<tc:token ID="t_57">way</tc:token>
<tc:token ID="t_58">to</tc:token>
<tc:token ID="t_59">achieve</tc:token>
<tc:token ID="t_60">this</tc:token>
<tc:token ID="t_61">is</tc:token>
<tc:token ID="t_62">to</tc:token>
<tc:token ID="t_63">have</tc:token>
<tc:token ID="t_64">total</tc:token>
<tc:token ID="t_65">transparency</tc:token>
<tc:token ID="t_66">as</tc:token>
<tc:token ID="t_67">far</tc:token>
<tc:token ID="t_68">as</tc:token>
<tc:token ID="t_69">food</tc:token>
<tc:token ID="t_70">labelling</tc:token>
<tc:token ID="t_71">is</tc:token>
<tc:token ID="t_72">concerned</tc:token>
<tc:token ID="t_73">.</tc:token>
<tc:token ID="t_74">GMOs</tc:token>
<tc:token ID="t_75">are</tc:token>
<tc:token ID="t_76">the</tc:token>
<tc:token ID="t_77">new</tc:token>
<tc:token ID="t_78">challenge</tc:token>
<tc:token ID="t_79">facing</tc:token>
<tc:token ID="t_80">us</tc:token>
<tc:token ID="t_81">.</tc:token>
<tc:token ID="t_82">This</tc:token>
<tc:token ID="t_83">is</tc:token>
<tc:token ID="t_84">something</tc:token>
<tc:token ID="t_85">that</tc:token>
<tc:token ID="t_86">people</tc:token>
<tc:token ID="t_87">are</tc:token>
<tc:token ID="t_88">very</tc:token>
<tc:token ID="t_89">worried</tc:token>
<tc:token ID="t_90">about</tc:token>
<tc:token ID="t_91">and</tc:token>
<tc:token ID="t_92">quite</tc:token>
<tc:token ID="t_93">rightly</tc:token>
<tc:token ID="t_94">so</tc:token>
<tc:token ID="t_95">:</tc:token>
<tc:token ID="t_96">I</tc:token>
<tc:token ID="t_97">myself</tc:token>
<tc:token ID="t_98">share</tc:token>
<tc:token ID="t_99">these</tc:token>
<tc:token ID="t_100">concerns</tc:token>
<tc:token ID="t_101">.</tc:token>
<tc:token ID="t_102">But</tc:token>
<tc:token ID="t_103">I</tc:token>
<tc:token ID="t_104">think</tc:token>
<tc:token ID="t_105">we</tc:token>
<tc:token ID="t_106">should</tc:token>
<tc:token ID="t_107">not</tc:token>
<tc:token ID="t_108">allow</tc:token>
<tc:token ID="t_109">our</tc:token>
<tc:token ID="t_110">concerns</tc:token>
<tc:token ID="t_111">about</tc:token>
<tc:token ID="t_112">GMOs</tc:token>
<tc:token ID="t_113">to</tc:token>
<tc:token ID="t_114">overshadow</tc:token>
<tc:token ID="t_115">our</tc:token>
<tc:token ID="t_116">concerns</tc:token>
<tc:token ID="t_117">about</tc:token>
<tc:token ID="t_118">growth</tc:token>
<tc:token ID="t_119">promoters</tc:token>
<tc:token ID="t_120">used</tc:token>
<tc:token ID="t_121">in</tc:token>
<tc:token ID="t_122">animal</tc:token>
<tc:token ID="t_123">feed</tc:token>
<tc:token ID="t_124">or</tc:token>
<tc:token ID="t_125">antibiotics</tc:token>
<tc:token ID="t_126">used</tc:token>
<tc:token ID="t_127">in</tc:token>
<tc:token ID="t_128">compound</tc:token>
<tc:token ID="t_129">feed</tc:token>
<tc:token ID="t_130">.</tc:token>
<tc:token ID="t_131">Indeed</tc:token>
<tc:token ID="t_132">,</tc:token>
<tc:token ID="t_133">we</tc:token>
<tc:token ID="t_134">should</tc:token>
<tc:token ID="t_135">not</tc:token>
<tc:token ID="t_136">allow</tc:token>
<tc:token ID="t_137">GMOs</tc:token>
<tc:token ID="t_138">to</tc:token>
<tc:token ID="t_139">obscure</tc:token>
<tc:token ID="t_140">the</tc:token>
<tc:token ID="t_141">fact</tc:token>
<tc:token ID="t_142">that</tc:token>
<tc:token ID="t_143">meat</tc:token>
<tc:token ID="t_144">and</tc:token>
<tc:token ID="t_145">bonemeal</tc:token>
<tc:token ID="t_146">are</tc:token>
<tc:token ID="t_147">still</tc:token>
<tc:token ID="t_148">included</tc:token>
<tc:token ID="t_149">in</tc:token>
<tc:token ID="t_150">animal</tc:token>
<tc:token ID="t_151">feeds</tc:token>
<tc:token ID="t_152">in</tc:token>
<tc:token ID="t_153">many</tc:token>
<tc:token ID="t_154">countries</tc:token>
<tc:token ID="t_155">in</tc:token>
<tc:token ID="t_156">Europe</tc:token>
<tc:token ID="t_157">.</tc:token>
<tc:token ID="t_158">One</tc:token>
<tc:token ID="t_159">factor</tc:token>
<tc:token ID="t_160">behind</tc:token>
<tc:token ID="t_161">these</tc:token>
<tc:token ID="t_162">developments</tc:token>
<tc:token ID="t_163">that</tc:token>
<tc:token ID="t_164">has</tc:token>
<tc:token ID="t_165">been</tc:token>
<tc:token ID="t_166">referred</tc:token>
<tc:token ID="t_167">to</tc:token>
<tc:token ID="t_168">in</tc:token>
<tc:token ID="t_169">this</tc:token>
<tc:token ID="t_170">debate</tc:token>
<tc:token ID="t_171">is</tc:token>
<tc:token ID="t_172">competition</tc:token>
<tc:token ID="t_173">-</tc:token>
<tc:token ID="t_174">competition</tc:token>
<tc:token ID="t_175">between</tc:token>
<tc:token ID="t_176">Member</tc:token>
<tc:token ID="t_177">States</tc:token>
<tc:token ID="t_178">on</tc:token>
<tc:token ID="t_179">the</tc:token>
<tc:token ID="t_180">cost</tc:token>
<tc:token ID="t_181">of</tc:token>
<tc:token ID="t_182">the</tc:token>
<tc:token ID="t_183">production</tc:token>
<tc:token ID="t_184">of</tc:token>
<tc:token ID="t_185">food</tc:token>
<tc:token ID="t_186">.</tc:token>
<tc:token ID="t_187">These</tc:token>
<tc:token ID="t_188">are</tc:token>
<tc:token ID="t_189">all</tc:token>
<tc:token ID="t_190">areas</tc:token>
<tc:token ID="t_191">in</tc:token>
<tc:token ID="t_192">which</tc:token>
<tc:token ID="t_193">we</tc:token>
<tc:token ID="t_194">have</tc:token>
<tc:token ID="t_195">to</tc:token>
<tc:token ID="t_196">ensure</tc:token>
<tc:token ID="t_197">a</tc:token>
<tc:token ID="t_198">level</tc:token>
<tc:token ID="t_199">playing</tc:token>
<tc:token ID="t_200">field</tc:token>
<tc:token ID="t_201">:</tc:token>
<tc:token ID="t_202">food</tc:token>
<tc:token ID="t_203">must</tc:token>
<tc:token ID="t_204">be</tc:token>
<tc:token ID="t_205">of</tc:token>
<tc:token ID="t_206">the</tc:token>
<tc:token ID="t_207">same</tc:token>
<tc:token ID="t_208">standard</tc:token>
<tc:token ID="t_209">in</tc:token>
<tc:token ID="t_210">all</tc:token>
<tc:token ID="t_211">Member</tc:token>
<tc:token ID="t_212">States</tc:token>
<tc:token ID="t_213">.</tc:token>
<tc:token ID="t_214">We</tc:token>
<tc:token ID="t_215">have</tc:token>
<tc:token ID="t_216">had</tc:token>
<tc:token ID="t_217">the</tc:token>
<tc:token ID="t_218">dioxin</tc:token>
<tc:token ID="t_219">scare</tc:token>
<tc:token ID="t_220">,</tc:token>
<tc:token ID="t_221">BSE</tc:token>
<tc:token ID="t_222">and</tc:token>
<tc:token ID="t_223">many</tc:token>
<tc:token ID="t_224">other</tc:token>
<tc:token ID="t_225">problems</tc:token>
<tc:token ID="t_226">.</tc:token>
<tc:token ID="t_227">The</tc:token>
<tc:token ID="t_228">main</tc:token>
<tc:token ID="t_229">problem</tc:token>
<tc:token ID="t_230">is</tc:token>
<tc:token ID="t_231">a</tc:token>
<tc:token ID="t_232">financial</tc:token>
<tc:token ID="t_233">one</tc:token>
<tc:token ID="t_234">,</tc:token>
<tc:token ID="t_235">namely</tc:token>
<tc:token ID="t_236">who</tc:token>
<tc:token ID="t_237">bears</tc:token>
<tc:token ID="t_238">the</tc:token>
<tc:token ID="t_239">cost</tc:token>
<tc:token ID="t_240">?</tc:token>
<tc:token ID="t_241">The</tc:token>
<tc:token ID="t_242">problem</tc:token>
<tc:token ID="t_243">is</tc:token>
<tc:token ID="t_244">that</tc:token>
<tc:token ID="t_245">the</tc:token>
<tc:token ID="t_246">cost</tc:token>
<tc:token ID="t_247">is</tc:token>
<tc:token ID="t_248">not</tc:token>
<tc:token ID="t_249">shared</tc:token>
<tc:token ID="t_250">equally</tc:token>
<tc:token ID="t_251">between</tc:token>
<tc:token ID="t_252">the</tc:token>
<tc:token ID="t_253">consumer</tc:token>
<tc:token ID="t_254">and</tc:token>
<tc:token ID="t_255">the</tc:token>
<tc:token ID="t_256">producer</tc:token>
<tc:token ID="t_257">:</tc:token>
<tc:token ID="t_258">the</tc:token>
<tc:token ID="t_259">producer</tc:token>
<tc:token ID="t_260">has</tc:token>
<tc:token ID="t_261">been</tc:token>
<tc:token ID="t_262">forced</tc:token>
<tc:token ID="t_263">to</tc:token>
<tc:token ID="t_264">bear</tc:token>
<tc:token ID="t_265">all</tc:token>
<tc:token ID="t_266">the</tc:token>
<tc:token ID="t_267">cost</tc:token>
<tc:token ID="t_268">.</tc:token>
<tc:token ID="t_269">We</tc:token>
<tc:token ID="t_270">need</tc:token>
<tc:token ID="t_271">an</tc:token>
<tc:token ID="t_272">equitable</tc:token>
<tc:token ID="t_273">distribution</tc:token>
<tc:token ID="t_274">of</tc:token>
<tc:token ID="t_275">the</tc:token>
<tc:token ID="t_276">extra</tc:token>
<tc:token ID="t_277">cost</tc:token>
<tc:token ID="t_278">that</tc:token>
<tc:token ID="t_279">has</tc:token>
<tc:token ID="t_280">been</tc:token>
<tc:token ID="t_281">incurred</tc:token>
<tc:token ID="t_282">.</tc:token>
<tc:token ID="t_283">We</tc:token>
<tc:token ID="t_284">must</tc:token>
<tc:token ID="t_285">also</tc:token>
<tc:token ID="t_286">ensure</tc:token>
<tc:token ID="t_287">that</tc:token>
<tc:token ID="t_288">the</tc:token>
<tc:token ID="t_289">food</tc:token>
<tc:token ID="t_290">that</tc:token>
<tc:token ID="t_291">is</tc:token>
<tc:token ID="t_292">imported</tc:token>
<tc:token ID="t_293">into</tc:token>
<tc:token ID="t_294">the</tc:token>
<tc:token ID="t_295">European</tc:token>
<tc:token ID="t_296">Union</tc:token>
<tc:token ID="t_297">meets</tc:token>
<tc:token ID="t_298">the</tc:token>
<tc:token ID="t_299">standards</tc:token>
<tc:token ID="t_300">within</tc:token>
<tc:token ID="t_301">the</tc:token>
<tc:token ID="t_302">European</tc:token>
<tc:token ID="t_303">Union</tc:token>
<tc:token ID="t_304">.</tc:token>
<tc:token ID="t_305">If</tc:token>
<tc:token ID="t_306">we</tc:token>
<tc:token ID="t_307">do</tc:token>
<tc:token ID="t_308">not</tc:token>
<tc:token ID="t_309">maintain</tc:token>
<tc:token ID="t_310">those</tc:token>
<tc:token ID="t_311">standards</tc:token>
<tc:token ID="t_312">for</tc:token>
<tc:token ID="t_313">imported</tc:token>
<tc:token ID="t_314">food</tc:token>
<tc:token ID="t_315">then</tc:token>
<tc:token ID="t_316">we</tc:token>
<tc:token ID="t_317">are</tc:token>
<tc:token ID="t_318">going</tc:token>
<tc:token ID="t_319">to</tc:token>
<tc:token ID="t_320">encounter</tc:token>
<tc:token ID="t_321">greater</tc:token>
<tc:token ID="t_322">difficulties</tc:token>
<tc:token ID="t_323">in</tc:token>
<tc:token ID="t_324">the</tc:token>
<tc:token ID="t_325">future</tc:token>
<tc:token ID="t_326">.</tc:token>
</tc:tokens>
<tc:sentences xmlns:tc="http://www.dspin.de/data/textcorpus">
<tc:sentence tokenIDs="t_0 t_1 t_2 t_3 t_4 t_5 t_6 t_7 t_8 t_9 t_10 t_11 t_12 t_13 t_14 t_15 t_16 t_17 t_18 t_19 t_20 t_21 t_22 t_23 t_24 t_25 t_26"></tc:sentence>
<tc:sentence tokenIDs="t_27 t_28 t_29 t_30 t_31 t_32 t_33 t_34"></tc:sentence>
<tc:sentence tokenIDs="t_35 t_36 t_37 t_38 t_39 t_40 t_41 t_42 t_43 t_44 t_45 t_46 t_47 t_48 t_49 t_50 t_51 t_52 t_53 t_54 t_55"></tc:sentence>
<tc:sentence tokenIDs="t_56 t_57 t_58 t_59 t_60 t_61 t_62 t_63 t_64 t_65 t_66 t_67 t_68 t_69 t_70 t_71 t_72 t_73"></tc:sentence>
<tc:sentence tokenIDs="t_74 t_75 t_76 t_77 t_78 t_79 t_80 t_81"></tc:sentence>
<tc:sentence tokenIDs="t_82 t_83 t_84 t_85 t_86 t_87 t_88 t_89 t_90 t_91 t_92 t_93 t_94 t_95 t_96 t_97 t_98 t_99 t_100 t_101"></tc:sentence>
<tc:sentence tokenIDs="t_102 t_103 t_104 t_105 t_106 t_107 t_108 t_109 t_110 t_111 t_112 t_113 t_114 t_115 t_116 t_117 t_118 t_119 t_120 t_121 t_122 t_123 t_124 t_125 t_126 t_127 t_128 t_129 t_130"></tc:sentence>
<tc:sentence tokenIDs="t_131 t_132 t_133 t_134 t_135 t_136 t_137 t_138 t_139 t_140 t_141 t_142 t_143 t_144 t_145 t_146 t_147 t_148 t_149 t_150 t_151 t_152 t_153 t_154 t_155 t_156 t_157"></tc:sentence>
<tc:sentence tokenIDs="t_158 t_159 t_160 t_161 t_162 t_163 t_164 t_165 t_166 t_167 t_168 t_169 t_170 t_171 t_172 t_173 t_174 t_175 t_176 t_177 t_178 t_179 t_180 t_181 t_182 t_183 t_184 t_185 t_186"></tc:sentence>
<tc:sentence tokenIDs="t_187 t_188 t_189 t_190 t_191 t_192 t_193 t_194 t_195 t_196 t_197 t_198 t_199 t_200 t_201 t_202 t_203 t_204 t_205 t_206 t_207 t_208 t_209 t_210 t_211 t_212 t_213"></tc:sentence>
<tc:sentence tokenIDs="t_214 t_215 t_216 t_217 t_218 t_219 t_220 t_221 t_222 t_223 t_224 t_225 t_226"></tc:sentence>
<tc:sentence tokenIDs="t_227 t_228 t_229 t_230 t_231 t_232 t_233 t_234 t_235 t_236 t_237 t_238 t_239 t_240"></tc:sentence>
<tc:sentence tokenIDs="t_241 t_242 t_243 t_244 t_245 t_246 t_247 t_248 t_249 t_250 t_251 t_252 t_253 t_254 t_255 t_256 t_257 t_258 t_259 t_260 t_261 t_262 t_263 t_264 t_265 t_266 t_267 t_268"></tc:sentence>
<tc:sentence tokenIDs="t_269 t_270 t_271 t_272 t_273 t_274 t_275 t_276 t_277 t_278 t_279 t_280 t_281 t_282"></tc:sentence>
<tc:sentence tokenIDs="t_283 t_284 t_285 t_286 t_287 t_288 t_289 t_290 t_291 t_292 t_293 t_294 t_295 t_296 t_297 t_298 t_299 t_300 t_301 t_302 t_303 t_304"></tc:sentence>
<tc:sentence tokenIDs="t_305 t_306 t_307 t_308 t_309 t_310 t_311 t_312 t_313 t_314 t_315 t_316 t_317 t_318 t_319 t_320 t_321 t_322 t_323 t_324 t_325 t_326"></tc:sentence>
</tc:sentences>
</TextCorpus>
</D-Spin>
package de.tuebingen.uni.sfs.clarind.core;
import static org.junit.Assert.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Paths;
import org.junit.Test;
import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessor;
import eu.clarin.weblicht.wlfxb.api.TextCorpusProcessorException;
import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed;
import eu.clarin.weblicht.wlfxb.io.WLFormatException;
/**
 * Smoke test for {@code MarmotTool}: runs the tool over a sample TCF input
 * file and writes the annotated corpus to {@code output.xml}.
 *
 * <p>NOTE(review): this test depends on the fixture file
 * {@code chainer_result_4000722053992600189.xml} being present in the working
 * directory — TODO: move it to {@code src/test/resources} and load it via the
 * classpath so the test is portable.
 */
public class MarmotToolTest {

    @Test
    public void test() throws Exception {
        TextCorpusProcessor tool = new MarmotTool();
        // Open the input before the output so a missing fixture file fails
        // fast without leaving an empty output.xml behind. try-with-resources
        // closes both streams even when processing throws (the original
        // leaked both on any failure path).
        try (InputStream input = new FileInputStream("chainer_result_4000722053992600189.xml");
             OutputStream output = new FileOutputStream("output.xml")) {
            TextCorpusStreamed textCorpus =
                    new TextCorpusStreamed(input, tool.getRequiredLayers(), output, false);
            tool.process(textCorpus);
            // close() flushes the remaining (non-processed) layers to the
            // output stream; without it the written TCF document is truncated.
            textCorpus.close();
        }
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment