Commit 0b611765 authored by Ben Campbell's avatar Ben Campbell
Browse files

Initial commit

parents
# TYPE selects the build flavour: debug, prof, static, lib, Sun64bit,
# or the empty string (default: -O3 build whose programs are stripped)
TYPE =

# try different definitions of SGILIB if hash_map is not found
#SGILIB=
SGILIB=-DSGI__gnu_cxx
#SGILIB=-DSGIext

# readline support: both lines are disabled here; uncomment them together
# to compile with -DREADLINE and link the readline libraries
#READLINE = -DREADLINE
#LREADLINE = -lreadline -lhistory -lncurses

# data type used for the encoding of the symbols (char, short, or int)
# short is the default
#CDT=-DCODE_DATA_TYPE=short

CXX = g++
WARNING = -Wall -Wcast-qual -Wconversion -std=c++98 -Wno-deprecated -ansi

# Flags that depend on TYPE.  Every non-debug build defines NDEBUG; every
# build that is neither debug nor prof is compiled with -O3.
ifeq ($(TYPE),debug)
  DEBUG = -ggdb
  OPTIMIZATION = -O0
else
  DEBUG = -DNDEBUG
  ifeq ($(TYPE),prof)
    OPTIMIZATION = -O2
    PROFILE = -pg
  else
    OPTIMIZATION = -O3
    ifeq ($(TYPE),static)
      STATIC = -static
    else ifeq ($(TYPE),Sun64bit)
      LOPT = -L/usr/lib/64 -L/usr/local/lib/sparcv9 -R/usr/lib/64 -R/usr/local/lib/sparcv9
      CXX = g++64
      BITS= -mcpu=v9
    else ifeq ($(TYPE),lib)
      LIB = -fPIC
    endif
  endif
endif

# CL is the command used to link libsfst.so and the test program.
# NOTE(review): CC stays g++ even when CXX was switched to g++64 above --
# confirm that this is intended for Sun64bit builds.
CC = g++
CL = $(CC)

CFLAGS = $(DEBUG) $(PROFILE) $(LIB) $(OPTIMIZATION) $(WARNING) $(SGILIB) $(CDT) $(BITS) $(READLINE)
LDFLAGS = $(DEBUG) $(PROFILE) $(STATIC) $(LOPT)
CXXFLAGS = $(CFLAGS)
LXXFLAGS = $(LDFLAGS)

# installation commands and location (PREFIX carries its trailing slash)
INSTALL = install
INSTALL_MANPAGES = install -m 644
INSTALL_DIR = install -d -m 755
DESTDIR =
PREFIX = /usr/local/

PROGRAMS = fst-compiler fst-infl fst-generate fst-print fst-compare \
           fst-compact fst-infl2 fst-lowmem fst-infl3 fst-lattice \
           fst-match fst-parse fst-parse2 fst-train fst-compiler-utf8 \
           fst-text2bin fst-mor
ALLPROGRAMS = $(PROGRAMS)

# What "make all" builds: the library archive for TYPE=lib, unstripped
# programs for TYPE=debug, stripped programs otherwise.
ifeq ($(TYPE),lib)
  TARGET = libsfst.tar.gz
else ifeq ($(TYPE),debug)
  TARGET = $(ALLPROGRAMS)
else
  TARGET = strip
endif

# objects linked into every program / additionally into the compilers
OBJ = basic.o utf8.o alphabet.o fst.o operators.o determinise.o hopcroft.o
OBJ2 = $(OBJ) compact.o make-compact.o interface.o
# Targets that name actions rather than files.  maninstall is listed as
# well so that a file of that name cannot mask the rule.
.PHONY: all clean realclean archive remake install maninstall strip

all: $(TARGET)

# Build every program, then strip the binaries.
strip: $(ALLPROGRAMS)
	strip $(ALLPROGRAMS)

fst-api.zip: test.C alphabet.C alphabet.h basic.C basic.h compact.C \
	compact.h lowmem.C lowmem.h robust.C
	zip $@ $^

# Prerequisite-only lines: they add objects to programs that are linked
# by the generic "%" rule below.
fst-compact: compact.o make-compact.o
fst-generate: generate.o
fst-match: compact.o
fst-train: compact.o
fst-compiler: $(OBJ2) default-scanner.o

fst-compiler-utf8: $(OBJ2) utf8-scanner.o fst-compiler.o
	$(CXX) -o $@ $^ $(LDFLAGS)

# fst-mor is the only program linked against the readline libraries.
fst-mor: $(OBJ) fst-mor.o
	$(CXX) -o $@ $^ $(LDFLAGS) $(LREADLINE)

fst-text2bin: $(OBJ) fst-text2bin.o
	$(CXX) -o $@ $^ $(LDFLAGS)

# Generic link rule: a program is linked from the common objects, its own
# <name>.o and any extra prerequisites declared above.
%: $(OBJ) %.o
	$(CXX) -o $@ $^ $(LDFLAGS)

# Programs that do not use the common object set.
fst-infl2: alphabet.o basic.o compact.o robust.o utf8.o fst-infl2.o
	$(CXX) -o $@ $^ $(LDFLAGS)

fst-infl2-daemon: alphabet.o basic.o compact.o robust.o utf8.o fst-infl2-daemon.o
	$(CXX) -o $@ $^ $(LDFLAGS)

fst-infl3: alphabet.o basic.o utf8.o lowmem.o fst-infl3.o
	$(CXX) -o $@ $^ $(LDFLAGS)

# Library test program.  -lsfst follows the objects: linkers resolve
# symbols left to right, so a library named before the objects that need
# it may be dropped.
test: test.o
	$(CL) -o $@ $^ $(LDFLAGS) -L. -lsfst
# Parser source generated by bison.  The header requested with -d is
# renamed to fst-compiler.h when it was written as fst-compiler.C.h or
# as fst-compiler.H.
fst-compiler.C: fst.h compact.h make-compact.h interface.h fst-compiler.yy
	bison -d -o $@ $(filter %.yy,$^)
	if [ -f $@.h ]; then mv $@.h $(basename $@).h; fi
	if [ -f $(basename $@).H ]; then mv $(basename $@).H $(basename $@).h; fi

# Scanner sources generated by flex from the matching .ll file.
default-scanner.C: fst.h interface.h scanner.ll fst-compiler.C default-scanner.ll
	flex -o$@ $(basename $@).ll

utf8-scanner.C: fst.h interface.h scanner.ll fst-compiler.C utf8-scanner.ll
	flex -o$@ $(basename $@).ll

# Both scanner variants are derived from scanner.ll: lines tagged for the
# other variant are deleted and the own tag prefix is removed.
default-scanner.ll: scanner.ll
	sed -e '/^utf8>/d' -e 's/^default> *//' $< > $@

utf8-scanner.ll: scanner.ll
	sed -e '/^default>/d' -e 's/^utf8> *//' $< > $@
# Library distribution: the shared object plus the public headers.
libsfst.tar.gz: libsfst.so compact.h alphabet.h basic.h fst.h
	tar -zcf $@ $^

libsfst.so: alphabet.o basic.o utf8.o compact.o fst.o operators.o determinise.o generate.o hopcroft.o
	$(CL) -shared -o $@ $^ $(LDFLAGS)

remake: clean $(ALLPROGRAMS)

# Errors of the rm commands are ignored on purpose (leading "-").
clean:
	-rm -f *.o *~ Makefile.bak y.tab.h man1/*~ 2>&- > /dev/null

realclean: clean
	-rm -f $(ALLPROGRAMS) test 2>&- > /dev/null

# The bin directory is created first, as maninstall does for its own
# directory, so that installing into a fresh DESTDIR works.  "|| exit 1"
# stops at the first failing install; a plain shell loop would report
# only the status of the last one.
install: $(PROGRAMS)
	$(INSTALL_DIR) $(DESTDIR)$(PREFIX)bin
	for p in $(PROGRAMS); do $(INSTALL) $$p $(DESTDIR)$(PREFIX)bin/$$p || exit 1; done

maninstall:
	$(INSTALL_DIR) $(DESTDIR)$(PREFIX)man/man1
	for m in man1/*.1; do $(INSTALL_MANPAGES) $$m $(DESTDIR)$(PREFIX)man/$$m || exit 1; done

# Source archive named after the current date (yymmdd).
archive:
	tar -zhcf VERSION-`date '+%y%m%d'`.tar.gz *.ll *.yy *.[Ch] Makefile man1/*

# Refreshes the header dependency list in this Makefile; a failing
# makedepend is ignored.
Makefile: *.C *.ll *.yy *.h
	-makedepend -Y -- $(CFLAGS) -- *.C 2>/dev/null
# Header dependencies of the object files.  The "Makefile" rule runs
# makedepend, which presumably rewrites everything below the marker line.
# DO NOT DELETE
TransducerTest.o: Transducer.h
alphabet.o: utf8.h alphabet.h basic.h sgi.h
basic.o: basic.h
compact.o: compact.h alphabet.h basic.h sgi.h
default-scanner.o: interface.h utf8.h fst.h alphabet.h basic.h sgi.h mem.h
default-scanner.o: fst-compiler.h
determinise.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-compact.o: make-compact.h fst.h alphabet.h basic.h sgi.h mem.h compact.h
fst-compare.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-compiler.o: make-compact.h fst.h alphabet.h basic.h sgi.h mem.h compact.h
fst-compiler.o: interface.h utf8.h
fst-generate.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-infl.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-infl2-daemon.o: Socket.h compact.h alphabet.h basic.h sgi.h
fst-infl2.o: compact.h alphabet.h basic.h sgi.h
fst-infl3.o: lowmem.h alphabet.h basic.h sgi.h
fst-lattice.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-lowmem.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-match.o: compact.h alphabet.h basic.h sgi.h
fst-mor.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-parse.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-parse2.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-print.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-text2bin.o: fst.h alphabet.h basic.h sgi.h mem.h
fst-train.o: compact.h alphabet.h basic.h sgi.h
fst.o: fst.h alphabet.h basic.h sgi.h mem.h
generate.o: fst.h alphabet.h basic.h sgi.h mem.h
interface.o: interface.h utf8.h fst.h alphabet.h basic.h sgi.h mem.h
lowmem.o: lowmem.h alphabet.h basic.h sgi.h
make-compact.o: make-compact.h fst.h alphabet.h basic.h sgi.h mem.h compact.h
operators.o: fst.h alphabet.h basic.h sgi.h mem.h
robust.o: compact.h alphabet.h basic.h sgi.h
test.o: compact.h alphabet.h basic.h sgi.h
utf8-scanner.o: interface.h utf8.h fst.h alphabet.h basic.h sgi.h mem.h
utf8-scanner.o: fst-compiler.h
utf8.o: utf8.h
hopcroft.o: fst.h alphabet.h basic.h sgi.h mem.h
/*******************************************************************/
/* */
/* File: Socket.h */
/* Author: Helmut Schmid */
/* Purpose: */
/* Created: Fri Aug 15 14:19:19 2008 */
/* Modified: Wed Sep 29 08:44:43 2010 (schmid) */
/* */
/*******************************************************************/
namespace SFST {
/***************** class Socket **********************************/
/* TCP server socket: the constructor creates the socket, binds it to   */
/* the given port on INADDR_ANY and starts listening; next_client       */
/* accepts one connection per call.                                     */
class Socket {
  int portno;                   /* port address */
  int sockfd;                   /* descriptor of the listening socket */
  struct sockaddr_in serv_addr; /* local address the socket is bound to */
  struct sockaddr cli_addr;     /* address of the last accepted client */
  socklen_t clilen;             /* size of cli_addr (in/out for accept) */

public:

  /* Accepts the next client connection and returns the value of        */
  /* accept(): the descriptor of the connection, or a negative value    */
  /* on failure.                                                        */
  int next_client() {
    /* accept() overwrites clilen with the size actually stored, so it  */
    /* is reset before every call                                       */
    clilen = sizeof(cli_addr);
    /* the peer address goes into cli_addr; serv_addr must keep the     */
    /* address the socket is bound to                                   */
    return accept( sockfd, &cli_addr, &clilen);
  }

  /* Creates the listening socket; prints a message and exits with      */
  /* status 1 if the socket cannot be created, bound or listened on.    */
  Socket( int port=7070 ): portno( port ) {

    /* create a socket */
    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
      fprintf(stderr, "ERROR opening socket\n");
      exit(1);
    }

    /* initialise serv_addr with zeros */
    bzero((char *) &serv_addr, sizeof(serv_addr));
    serv_addr.sin_family = AF_INET;

    /* convert portno to network byte order and */
    /* store it in serv_addr.sin_port */
    serv_addr.sin_port = htons(portno);

    /* set the host IP address (available in INADDR_ANY) */
    serv_addr.sin_addr.s_addr = INADDR_ANY;

    /* bind the socket to a host and port */
    if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
      fprintf(stderr, "ERROR on binding\n");
      exit(1);
    }

    /* Listen to the socket; up to 5 connections at a time */
    if (listen(sockfd, 5) < 0) {
      fprintf(stderr, "ERROR on listening\n");
      exit(1);
    }
    clilen = sizeof(cli_addr);
  }
};
}
/*******************************************************************/
/* */
/* File: Transducer.h */
/* Author: Helmut Schmid */
/* */
/*******************************************************************/
#include <stdio.h>
#include <vector>
namespace SFST {
// A single transducer arc: the lower/upper character pair and the index
// of the state the arc leads to.  A character value of 0 stands for the
// empty symbol (see Transducer::analyze1).
class Transition {
public:
  char lower;
  char upper;
  int target;

  Transition( char l, char u, size_t t )
    : lower(l), upper(u), target(t) {}
};
class State {
public:
bool final;
vector<Transition> transition;
State() { final = false; };
};
class Transducer {
private:
vector<State> state;
void analyze1( int sn, const char *s, vector<char> &ana,
vector<vector<char> > &analyses )
{
if (*s == 0 && state[sn].final)
analyses.push_back( ana );
vector<Transition> &t=state[sn].transition;
for( size_t i=0; i<t.size(); i++ ) {
if (t[i].upper == 0) {
ana.push_back(t[i].lower);
analyze1( t[i].target, s, ana, analyses);
ana.pop_back();
}
else if (t[i].upper == *s) {
ana.push_back(t[i].lower);
analyze1( t[i].target, s+1, ana, analyses);
ana.pop_back();
}
}
}
public:
Transducer( FILE *file ) {
char buffer[1000];
for( unsigned int line=0; (fgets(buffer, 1000, file)); line++ ) {
int s, t;
char u, l;
if (sscanf( buffer, "final: %d", &s) == 1) {
if (s >= (int)state.size())
state.resize( s+1 );
state[s].final = true;
}
else if (sscanf( buffer, "%d %c:<> %d", &s, &l, &t) == 3) {
if (s >= (int)state.size())
state.resize( s+1 );
state[s].transition.push_back(Transition(l, 0, t));
}
else if (sscanf( buffer, "%d <>:%c %d", &s, &u, &t) == 3) {
if (s >= (int)state.size())
state.resize( s+1 );
state[s].transition.push_back(Transition(0, u, t));
}
else if (sscanf( buffer, "%d %c:%c %d", &s, &l, &u, &t) == 4) {
if (s >= (int)state.size())
state.resize( s+1 );
state[s].transition.push_back(Transition(l, u, t));
}
else if (sscanf( buffer, "%d %c %d", &s, &l, &t) == 3) {
if (s >= (int)state.size())
state.resize( s+1 );
state[s].transition.push_back(Transition(l, l, t));
}
else {
fprintf(stderr,"Error: in line %u of transducer file at: %s\n",
line, buffer);
exit(1);
}
}
}
void analyze( const char *s, vector<vector<char> > &analyses ) {
vector<char> ana;
analyze1( 0, s, ana, analyses );
}
};
}
#include "Transducer.h"
#include <iostream>
using std::cerr;
int main( int argc, char **argv )
{
FILE *file;
file = fopen(argv[1],"rb"); // open the input file
if (file == NULL) exit(1);
try {
Transducer transducer(file); // (1) read the transducer
char buffer[1000];
while (fgets(buffer, 1000, stdin)) { // (2) next input line
// delete newline character
int l=strlen(buffer)-1;
if (buffer[l] == '\n')
buffer[l] = '\0';
printf("> %s\n", buffer); // print the input line
vector<vector<char> > analyses;
transducer.analyze(buffer, analyses); // (3) analyse the input
if (analyses.size() == 0)
printf( "no result for %s\n", buffer);
else
for( size_t i=0; i<analyses.size(); i++ ) {
for( size_t k=0; k<analyses[i].size(); k++ )
fputc(analyses[i][k], stdout);
fputc('\n', stdout);
}
}
}
catch (const char *p) {
cerr << p << "\n";
return 1;
}
return 0;
}
/*******************************************************************/
/* */
/* FILE alphabet.C */
/* MODULE alphabet */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE basic FST functions */
/* */
/*******************************************************************/
#include <climits>
#include <cstring>
#include "utf8.h"
#include "alphabet.h"
namespace SFST {
using std::vector;
using std::ostream;
// size of the buffer into which Alphabet::read reads one symbol string
const int BUFFER_SIZE=100000;
// textual form of the empty symbol; the Alphabet constructor registers it
// under the code Label::epsilon
char EpsilonString[]="<>";
/*******************************************************************/
/* */
/* Alphabet::add */
/* */
/*******************************************************************/
void Alphabet::add( const char *symbol, Character c )
{
char *s = fst_strdup(symbol);
cm[c] = s;
sm[s] = c;
}
/*******************************************************************/
/* */
/* Alphabet::Alphabet */
/* */
/*******************************************************************/
// Creates an alphabet which contains only the empty symbol "<>" and is
// not marked as UTF-8 encoded.
Alphabet::Alphabet()
{
  // the empty symbol always gets the code Label::epsilon
  add(EpsilonString, Label::epsilon);
  this->utf8 = false;
}
/*******************************************************************/
/* */
/* Alphabet::clear */
/* */
/*******************************************************************/
void Alphabet::clear()
{
char **s=new char*[cm.size()];
ls.clear();
sm.clear();
size_t i, n=0;
for( CharMap::iterator it=cm.begin(); it!=cm.end(); it++ )
s[n++] = it->second;
cm.clear();
for( i=0; i<n; i++ )
free(s[i]);
delete[] s;
}
/*******************************************************************/
/* */
/* Alphabet::print */
/* */
/*******************************************************************/
void Alphabet::print(void)
{
for( CharMap::iterator it=cm.begin(); it!=cm.end(); it++ )
fprintf(stderr, "%i\t%s\n", it->first, it->second);
return;
}
/*******************************************************************/
/* */
/* Alphabet::new_marker */
/* */
/*******************************************************************/
// Allocates the smallest unused non-zero code, registers it under the
// generated name ">code<" and returns it.  Throws a message string if
// every code is in use.
Character Alphabet::new_marker()
{
  Character code = 1;
  do {
    if (cm.find(code) == cm.end()) {
      // the name embeds the code, so it is unique among the markers
      char name[100];
      sprintf(name, ">%ld<", (long)code);
      add(name, code);
      return code;
    }
    // the counter wraps around to 0 once all codes have been tried
  } while (++code != 0);

  throw "Error: too many symbols in transducer definition";
}
/*******************************************************************/
/* */
/* is_marker_symbol */
/* */
/*******************************************************************/
// Returns true iff s matches the expression ">[0-9]+<", i.e. the form of
// the names generated by Alphabet::new_marker.  NULL is not a marker.
static bool is_marker_symbol( const char *s )
{
  if (s == NULL || s[0] != '>')
    return false;

  // skip the digits which follow the opening '>'
  const char *p = s + 1;
  while (*p >= '0' && *p <= '9')
    p++;

  // at least one digit, then a closing '<' at the very end of the string
  return p > s + 1 && p[0] == '<' && p[1] == '\0';
}
/*******************************************************************/
/* */
/* Alphabet::delete_markers */
/* */
/*******************************************************************/
void Alphabet::delete_markers()
{
vector<char*> sym;
vector<Character> code;
vector<Label> label;
for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
Character c=it->first;
char *s=it->second;
if (!is_marker_symbol(s)) {
sym.push_back(fst_strdup(s));
code.push_back(c);
}
}
for( LabelSet::const_iterator it=begin(); it!=end(); it++ ) {
Label l=*it;
if (!is_marker_symbol(code2symbol(l.upper_char())) &&
!is_marker_symbol(code2symbol(l.lower_char())))
label.push_back(l);
}
clear();
for( size_t i=0; i<sym.size(); i++ ) {
add_symbol(sym[i], code[i]);
free(sym[i]);
}
for( size_t i=0; i<label.size(); i++ )
insert( label[i] );
}
/*******************************************************************/
/* */
/* Alphabet::add_symbol */
/* */
/*******************************************************************/
// Returns the code of symbol.  An unknown symbol is added with the
// smallest unused non-zero code.  Throws a message string if no code is
// left.
Character Alphabet::add_symbol(const char *symbol)
{
  SymbolMap::iterator known = sm.find(symbol);
  if (known != sm.end())
    return known->second;

  // assign the symbol to some unused character; the counter wraps
  // around to 0 once all codes have been tried
  Character code = 1;
  while (code != 0) {
    if (cm.find(code) == cm.end()) {
      add(symbol, code);
      return code;
    }
    code++;
  }

  throw "Error: too many symbols in transducer definition";
}
/*******************************************************************/
/* */
/* Alphabet::add_symbol */
/* */
/*******************************************************************/
// Adds symbol with the fixed code c.  Nothing happens if exactly this
// pair is already defined.  A message string is thrown if the symbol is
// already defined with another code, or if the code is already used by
// another symbol.
void Alphabet::add_symbol( const char *symbol, Character c )
{
  // Shared message buffer.  All %s conversions below carry a precision,
  // so the longest message (about 240 characters) always fits.
  static char message[300];

  // check whether the symbol was previously defined
  int sc=symbol2code(symbol);
  if (sc != EOF) {
    if ((Character)sc == c)
      return;

    if (strlen(symbol) < 60) {
      sprintf(message, "Error: reinserting symbol '%.59s' in alphabet with incompatible character value %u %u", symbol, (unsigned)sc, (unsigned)c);
      throw message;
    }
    else
      throw "reinserting symbol in alphabet with incompatible character value";
  }

  // check whether the character is already in use
  const char *s=code2symbol(c);
  if (s == NULL) {
    add(symbol, c);
    return;
  }

  if (strcmp(s, symbol) != 0) {
    if (strlen(symbol) < 70)
      // the previously defined symbol has no length limit of its own,
      // so it is cut off after 100 characters
      sprintf(message,"Error: defining symbol %.69s as character %u (previously defined as %.100s)", symbol, (unsigned)c, s);
    else
      sprintf(message,"Error: defining a (very long) symbol with previously used character");
    throw message;
  }
}
/*******************************************************************/
/* */
/* Alphabet::write_char */
/* */
/*******************************************************************/
// Appends the printed form of the code c to buffer at offset *pos,
// advances *pos and terminates the string.  With with_brackets == false
// the enclosing angle brackets of a symbol "<...>" are left out.  A code
// without symbol is written as the character itself if it is in the
// range 32..255, and as "\code" otherwise.  The caller has to provide a
// sufficiently large buffer.
void Alphabet::write_char( Character c, char *buffer, int *pos,
			   bool with_brackets) const
{
  const char *s = code2symbol(c);

  if (s == NULL) {
    unsigned int uc = c;
    if (uc < 32 || uc >= 256) {
      sprintf(buffer+(*pos),"\\%u", uc);
      *pos += (int)strlen(buffer+(*pos));
    }
    else
      buffer[(*pos)++] = (char)c;
  }
  else {
    // [first,last] is the range of characters to be copied
    int first = 0;
    int last = (int)strlen(s)-1;
    if (!with_brackets && s[first] == '<' && s[last] == '>') {
      first++;
      last--;
    }
    for( ; first <= last; first++ )
      buffer[(*pos)++] = s[first];
  }

  buffer[*pos] = '\0';
}
/*******************************************************************/
/* */
/* Alphabet::write_char */
/* */
/*******************************************************************/
// Returns the printed form of the code c.  The result is stored in a
// static buffer which is overwritten by the next call of this function.
const char *Alphabet::write_char( Character c, bool with_brackets ) const
{
  static char buffer[1000];

  int length = 0;
  write_char( c, buffer, &length, with_brackets );

  return buffer;
}
/*******************************************************************/
/* */
/* Alphabet::write_label */
/* */
/*******************************************************************/
// Appends the printed form of the label l to buffer at offset *pos and
// advances *pos.  A label with two identical symbols is written as one
// symbol, any other label as "lower:upper".
void Alphabet::write_label( Label l, char *buffer, int *pos,
			    bool with_brackets ) const
{
  const Character lc = l.lower_char();
  const Character uc = l.upper_char();

  write_char( lc, buffer, pos, with_brackets );
  if (lc == uc)
    return;

  buffer[(*pos)++] = ':';
  write_char( uc, buffer, pos, with_brackets );
}
/*******************************************************************/
/* */
/* Alphabet::write_label */
/* */
/*******************************************************************/
// Returns the printed form of the label l.  The result is stored in a
// static buffer which is overwritten by the next call of this function.
const char *Alphabet::write_label( Label l, bool with_brackets ) const
{
  static char buffer[1000];

  int length = 0;
  write_label( l, buffer, &length, with_brackets );

  return buffer;
}
/*******************************************************************/
/* */
/* Alphabet::insert_symbols */
/* */
/*******************************************************************/
void Alphabet::insert_symbols( const Alphabet &a )
{
for( CharMap::const_iterator it=a.cm.begin(); it!=a.cm.end(); it++ )
add_symbol(it->second, it->first);
}
/*******************************************************************/
/* */
/* Alphabet::complement */
/* */
/*******************************************************************/
// Replaces sym by the codes of all symbols of the alphabet which are not
// contained in sym.  The empty symbol is never part of the result.
void Alphabet::complement( vector<Character> &sym )
{
  vector<Character> rest;

  for( CharMap::const_iterator entry=cm.begin(); entry!=cm.end(); ++entry ) {
    const Character c = entry->first;
    if (c == Label::epsilon)
      continue;

    // linear search for c in the given symbol list
    bool listed = false;
    for( size_t k=0; k<sym.size() && !listed; k++ )
      listed = (sym[k] == c);

    if (!listed)
      rest.push_back(c);
  }

  sym.swap(rest);
}
/*******************************************************************/
/* */
/* Alphabet::copy */
/* */
/*******************************************************************/
void Alphabet::copy( const Alphabet &a )
{
insert_symbols( a );
utf8 = a.utf8;
for( LabelSet::const_iterator it=a.begin(); it!=a.end(); it++ )
ls.insert( *it );
}
/*******************************************************************/
/* */
/* Alphabet::compose */
/* */
/*******************************************************************/
void Alphabet::compose( const Alphabet &la, const Alphabet &ua )
{
// insert the symbols
insert_symbols(la);
insert_symbols(ua);
utf8 = la.utf8;
hash_map<Character, hash_set<Character> > cs;
// create a hash table for a quick lookup of the target characters
for( iterator it=ua.begin(); it!=ua.end(); it++ ) {
Character lc=it->lower_char();
if (lc == Label::epsilon)
insert(*it);
else
cs[lc].insert(it->upper_char());
}
for( iterator it=la.begin(); it!=la.end(); it++ ) {
Character uc=it->upper_char();
if (uc == Label::epsilon)
insert(*it);
else {
if (cs.find(uc) != cs.end()) {
hash_set<Character> s=cs[uc];
Character lc=it->lower_char();
for( hash_set<Character>::iterator it=s.begin(); it!=s.end(); it++)
insert(Label(lc, *it));
}
}
}
}
/*******************************************************************/
/* */
/* operator<<(Alphabet) */
/* */
/*******************************************************************/
// Prints the symbol table as "code -> symbol" lines, followed by one
// line with all labels separated by blanks.
ostream &operator<<( ostream &s, const Alphabet &a )
{
  typedef Alphabet::CharMap::const_iterator SymbolIterator;

  for( SymbolIterator entry=a.cm.begin(); entry!=a.cm.end(); ++entry )
    s << entry->first << " -> " << entry->second << "\n";

  for( Alphabet::iterator p=a.begin(); p!=a.end(); ++p )
    s << a.write_label(*p) << " ";

  return s << "\n";
}
/*******************************************************************/
/* */
/* Alphabet::next_mcsym */
/* */
/* recognizes multi-character symbols which are enclosed with */
/* angle brackets <...>. If the argument flag insert is true, */
/* the multi-character symbol must be already in the lexicon in */
/* order to be recognized. */
/* */
/*******************************************************************/
// Tries to read a multi-character symbol "<...>" at the start of string.
// On success the code of the symbol is returned and string is moved
// behind the symbol.  Otherwise EOF is returned and string is unchanged.
// With insert == true an unknown symbol is added to the alphabet, else
// it is not recognised.  Only the first '>' after the '<' is considered.
int Alphabet::next_mcsym( char* &string, bool insert )
{
  if (*string != '<')
    return EOF;

  // symbol might start here
  for( char *p=string+1; *p; p++ ) {
    if (*p != '>')
      continue;

    // matching pair of angle brackets found
    // mark the end of the substring with \0
    char *behind = p+1;
    const char saved = *behind;
    *behind = 0;

    int c;
    if (insert)
      c = add_symbol( string );
    else
      c = symbol2code( string );

    // restore the original string
    *behind = saved;

    if (c == EOF)
      return EOF;  // not a complex character

    // symbol found: skip it and return its code
    string = behind;
    return (Character)c;
  }

  return EOF;
}
/*******************************************************************/
/* */
/* Alphabet::next_code */
/* */
/*******************************************************************/
// Reads the next symbol from string, moves string behind it and returns
// the symbol code.  Returns EOF at the end of the string and after an
// UTF-8 decoding error.  A multi-character symbol "<...>" is tried
// first (see next_mcsym); otherwise a single character is read (in
// extended mode a backslash quotes the following character) and added
// to the alphabet if it is new.
int Alphabet::next_code( char* &string, bool extended, bool insert )
{
  if (*string == 0)
    return EOF; // finished

  int c = next_mcsym(string, insert);
  if (c != EOF)
    return c;

  // remove quotation; a backslash at the very end of the string quotes
  // nothing and is read as an ordinary character, otherwise the code
  // below would step over the terminating \0
  if (extended && *string == '\\' && *(string+1) != 0)
    string++;

  if (utf8) {
    unsigned int uc = utf8toint( &string );
    if (uc == 0) {
      fprintf(stderr, "Error in UTF-8 encoding!\n");
      return EOF; // error encountered in utf8 character
    }
    return (int)add_symbol(int2utf8(uc));
  }
  else {
    char buffer[2];
    buffer[0] = *string;
    buffer[1] = 0;
    string++;
    return (int)add_symbol(buffer);
  }
}
/*******************************************************************/
/* */
/* Alphabet::next_label */
/* */
/*******************************************************************/
// Reads the next label from string and moves string behind it.  In
// extended mode a label may be a pair "lower:upper".  Epsilon labels are
// skipped.  The epsilon label is returned at the end of the string.
// Throws a message string if the symbol after ':' is missing.
Label Alphabet::next_label( char* &string, bool extended )
{
  // read first character
  int c = next_code( string, extended );
  if (c == EOF)
    return Label(); // end of string reached

  Character lc=(Character)c;
  if (!extended || *string != ':') { // single character?
    if (lc == Label::epsilon)
      return next_label(string, extended); // ignore epsilon
    return Label(lc);
  }

  // read second character
  string++; // jump over ':'
  c = next_code( string );
  if (c == EOF) {
    static char buffer[1000];
    // the rest of the input may be arbitrarily long; the precision
    // keeps the message within the buffer
    sprintf(buffer,"Error: incomplete symbol in input file: %.900s", string);
    throw buffer;
  }

  Label l(lc, (Character)c);
  if (l.is_epsilon())
    return next_label(string, extended); // ignore epsilon transitions
  return l;
}
/*******************************************************************/
/* */
/* Alphabet::string2symseq */
/* */
/*******************************************************************/
void Alphabet::string2symseq( char *s, vector<Character> &ch )
{
int c;
while ((c = next_code(s, false)) != EOF)
ch.push_back((Character)c);
}
/*******************************************************************/
/* */
/* Alphabet::string2labelseq */
/* */
/*******************************************************************/
// Appends the labels read from the string s to labels.  Reading stops
// when next_label returns the epsilon label, i.e. at the end of s.
void Alphabet::string2labelseq( char *s, vector<Label> &labels )
{
  for(;;) {
    Label l = next_label(s);
    if (l == Label::epsilon)
      break;
    labels.push_back(l);
  }
}
/*******************************************************************/
/* */
/* Alphabet::store */
/* */
/*******************************************************************/
void Alphabet::store( FILE *file ) const
{
char c=(utf8)? (char)1: (char)0;
fputc(c, file);
// write the symbol mapping
Character n=(Character)cm.size();
fwrite(&n, sizeof(n), 1, file);
for( CharMap::const_iterator it=cm.begin(); it!=cm.end(); it++ ) {
Character c=it->first;
char *s=it->second;
fwrite(&c, sizeof(c), 1, file);
fwrite(s, sizeof(char), strlen(s)+1, file);
}
// write the character pairs
n = (Character)size();
fwrite(&n, sizeof(n), 1, file);
for( LabelSet::const_iterator p=ls.begin(); p!=ls.end(); p++ ) {
Character c=p->lower_char();
fwrite(&c, sizeof(c), 1, file);
c = p->upper_char();
fwrite(&c, sizeof(c), 1, file);
}
if (ferror(file))
throw "Error encountered while writing alphabet to file\n";
}
/*******************************************************************/
/* */
/* Alphabet::read */
/* */
/*******************************************************************/
void Alphabet::read( FILE *file )
{
utf8 = (fgetc(file) != 0);
// read the symbol mapping
Character n=0;
read_num(&n, sizeof(n), file);
for( unsigned i=0; i<n; i++) {
char buffer[BUFFER_SIZE];
Character c;
read_num(&c, sizeof(c), file);
if (!read_string(buffer, BUFFER_SIZE, file) ||
feof(file) || ferror(file))
throw "Error1 occurred while reading alphabet!\n";
add_symbol(buffer, c);
}
// read the character pairs
read_num(&n, sizeof(n), file);
if (ferror(file))
throw "Error2 occurred while reading alphabet!\n";
for( unsigned i=0; i<n; i++) {
Character lc, uc;
read_num(&lc, sizeof(lc), file);
read_num(&uc, sizeof(uc), file);
insert(Label(lc, uc));
}
if (ferror(file))
throw "Error3 occurred while reading alphabet!\n";
}
/*******************************************************************/
/* */
/* Alphabet::compute_score */
/* */
/*******************************************************************/
// Computes a complexity score for the analysis ana from the printed
// forms of its lower (analysis) symbols.  The score is never positive;
// Alphabet::disambiguate keeps the analyses with the highest score.
// If the analysis contains <X> symbols, the score is minus their number.
// Otherwise one point is subtracted for each upper-case tag that is
// counted (see the exceptions below), two further points for <PREF>,
// and counting stops at the first tag of the form <+...>.
// Every sym below points into the static buffer of write_char and is
// only valid until the next call of write_char.
int Alphabet::compute_score( Analysis &ana )
{
// check whether the morpheme boundaries are explicitly marked
// with <X> tags
int score=0;
for( size_t i=0; i<ana.size(); i++ ) {
// get next symbol
const char *sym=write_char(ana[i].lower_char());
if (strcmp(sym,"<X>") == 0)
score--;
}
if (score < 0)
return score;
// No explicit morpheme boundary markers have been found.
// Count the number of part-of-speech and PREF tags.
for( size_t i=0; i<ana.size(); i++ ) {
// get next symbol
const char *sym=write_char(ana[i].lower_char());
// Is it not a multi-character symbol
if (sym[0] != '<' || sym[1] == 0)
continue;
// Is it a POS tag starting with "+" like <+NN>?
// (the score collected so far is returned immediately in that case)
if (sym[1] == '+') {
const char *t=sym+2;
for( ; *t >= 'A' && *t <= 'Z'; t++) ;
if (t > sym+2 && *t == '>')
return score;
}
// Is it a potential POS tag (i.e. all uppercase)?
const char *t = sym+1;
for( ; *t >= 'A' && *t <= 'Z'; t++) ;
if (t == sym+1 || *t != '>')
continue;
// uppercase symbol found
if (strcmp(sym,"<SUFF>") == 0 ||
strcmp(sym,"<OLDORTH>") == 0 ||
strcmp(sym,"<NEWORTH>") == 0)
continue; // not what we are looking for
// disprefer nouns with prefixes
// (<PREF> costs 3 points in total: 2 here plus the general one below)
if (strcmp(sym,"<PREF>") == 0)
score-=2;
if (strcmp(sym,"<V>") == 0 || strcmp(sym,"<ADJ>") == 0) {
// must be evaluated before sym is overwritten below
bool is_verb=(strcmp(sym,"<V>")==0);
// get the next non-empty symbol
Character c=Label::epsilon;
size_t k;
for( k=i+1; k<ana.size(); k++ )
if ((c = ana[k].lower_char()) != Label::epsilon)
break;
// Is it a participle
if (c != Label::epsilon) {
sym = write_char(c);
// skip one <OLDORTH>, <NEWORTH> or <SUFF> symbol and look at the
// non-empty symbol behind it
if (strcmp(sym,"<OLDORTH>") == 0 ||
strcmp(sym,"<NEWORTH>") == 0 ||
strcmp(sym,"<SUFF>") == 0) {
for( k++; k<ana.size(); k++ )
if ((c = ana[k].lower_char()) != Label::epsilon)
break;
if (c != Label::epsilon)
sym = write_char(c);
}
if (is_verb &&
(strcmp(sym,"<PPres>") == 0 || strcmp(sym,"<PPast>") == 0))
continue; // don't consider participles as complex
if (!is_verb &&
(strcmp(sym,"<Sup>") == 0 || strcmp(sym,"<Comp>") == 0))
continue; // <ADJ> followed by <Sup>/<Comp> (presumably superlative/comparative) is not counted as complex either
}
}
score--;
}
return score;
}
/*******************************************************************/
/* */
/* Alphabet::disambiguate */
/* */
/*******************************************************************/
// Keeps only the analyses with the highest score (see compute_score).
// The order of the remaining analyses is not changed.
void Alphabet::disambiguate( vector<Analysis> &analyses )
{
  const size_t total = analyses.size();

  // compute the scores
  vector<int> score(total);
  int best = INT_MIN;
  for( size_t i=0; i<total; i++ ) {
    score[i] = compute_score(analyses[i]);
    if (score[i] > best)
      best = score[i];
  }

  // delete suboptimal analyses: the best ones are moved to the front
  size_t kept = 0;
  for( size_t i=0; i<total; i++ ) {
    if (score[i] != best)
      continue;
    analyses[kept] = analyses[i];
    kept++;
  }
  analyses.resize(kept);
}
/*******************************************************************/
/* */
/* Alphabet::print_analysis */
/* */
/*******************************************************************/
// Returns the printed form of the analysis ana.  With both_layers the
// complete labels are printed, otherwise only the non-empty lower
// symbols.  The result is owned by this function and is released by the
// next call.
char *Alphabet::print_analysis( Analysis &ana, bool both_layers )
{
  vector<char> text;

  // for each transition
  for( size_t i=0; i<ana.size(); i++ ) {
    const Label l=ana[i];
    const char *piece;

    // either print the analysis symbol or the whole label
    if (!both_layers) {
      if (l.lower_char() == Label::epsilon)
	continue;
      piece = write_char(l.lower_char());
    }
    else {
      piece = write_label(l);
      // quote colons
      if (strcmp(piece,":") == 0)
	text.push_back('\\');
    }

    // copy the characters to the character array
    for( ; *piece; piece++ )
      text.push_back(*piece);
  }
  text.push_back(0); // terminate the string

  // replace the string returned by the previous call
  static char *result=NULL;
  delete[] result;
  result = new char[text.size()];
  memcpy(result, &text[0], text.size());

  return result;
}
/*******************************************************************/
/* */
/* Alphabet::operator== */
/* */
/*******************************************************************/
// Two alphabets are equal if they contain the same symbols with the
// same codes.  Only the symbol tables are compared, not the label sets
// or the utf8 flags.
bool Alphabet::operator==(const Alphabet &alpha) const
{
  // with the same number of symbols it suffices to find every symbol of
  // this alphabet with an identical code in the other alphabet
  if (this->sm.size() != alpha.sm.size())
    return false;

  for ( SymbolMap::const_iterator it = this->sm.begin(); it != this->sm.end(); it++ )
    {
      SymbolMap::const_iterator alpha_it = alpha.sm.find(it->first);
      if ( alpha_it == alpha.sm.end() )
	return false;
      // differing codes make the alphabets unequal (identical codes are
      // the expected case and must not)
      if ( alpha_it->second != it->second )
	return false;
    }
  return true;
}
}
/*******************************************************************/
/* */
/* FILE alphabet.h */
/* MODULE alphabet */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE finite state tools */
/* */
/*******************************************************************/
#ifndef _ALPHABET_H_
#define _ALPHABET_H_
#include <stdio.h>
#include "basic.h"
#include <set>
#include <vector>
#include <iostream>
#include <cstring>
#include "sgi.h"
namespace SFST {
// The symbol codes are unsigned short values unless the macro
// CODE_DATA_TYPE is defined, e.g. by the -DCODE_DATA_TYPE=... option
// prepared in the Makefile; then the unsigned variant of that type is used.
#ifndef CODE_DATA_TYPE
typedef unsigned short Character; // data type of the symbol codes
#else
typedef unsigned CODE_DATA_TYPE Character;
#endif
// data type used to indicate whether some action is to be performed
// on the analysis level (lower) or the surface level (upper)
typedef enum {upper, lower} Level;
/***************** class Label ***********************************/
// A transition label: a pair of symbol codes for the analysis level
// (lower) and the surface level (upper).
class Label {

private:
  // data structure where the two symbols are stored
  struct {
    Character lower;
    Character upper;
  } label;

public:
  static const Character epsilon=0; // code of the empty symbol

  // new label with two identical symbols (the epsilon label by default)
  Label( Character c=epsilon ) {
    label.lower = c;
    label.upper = c;
  }

  // new label with two different symbols
  Label( Character c1, Character c2 ) {
    label.lower = c1;
    label.upper = c2;
  }

  // returns the symbol of the indicated level
  Character get_char( Level l ) const {
    if (l == upper)
      return label.upper;
    return label.lower;
  }

  // returns the "upper" symbol of the label (i.e. the surface symbol)
  Character upper_char() const { return label.upper; }

  // returns the "lower" symbol of the label (i.e. the analysis symbol)
  Character lower_char() const { return label.lower; }

  // returns a copy of the label in which each occurrence of the
  // symbol c is replaced by nc
  Label replace_char( Character c, Character nc ) const {
    return Label( (label.lower == c)? nc: label.lower,
		  (label.upper == c)? nc: label.upper );
  }

  // operators checking the equality of labels
  int operator==( Label l ) const {
    return !(label.lower != l.label.lower || label.upper != l.label.upper);
  }

  int operator!=( Label l ) const {
    return !(*this == l);
  }

  // comparison operators needed for sorting labels in compact.C;
  // the upper symbol is compared first, then the lower symbol
  int operator<( Label l ) const {
    if (upper_char() != l.upper_char())
      return upper_char() < l.upper_char();
    return lower_char() < l.lower_char();
  }

  int operator>( Label l ) const {
    if (upper_char() != l.upper_char())
      return upper_char() > l.upper_char();
    return lower_char() > l.lower_char();
  }

  // check whether the label is epsilon (i.e. both symbols are epsilon)
  // transitions with epsilon labels are epsilon transitions
  int is_epsilon() const {
    return (label.upper == epsilon && label.lower == epsilon);
  }

  // check whether the "upper" symbol is epsilon
  int upper_is_epsilon() const { return (label.upper == epsilon); }

  // check whether the "lower" symbol is epsilon
  int lower_is_epsilon() const { return (label.lower == epsilon); }

  // hash function needed to store labels in a hash table
  struct label_hash {
    size_t operator() ( const Label l ) const {
      const size_t lc = (size_t)l.lower_char();
      const size_t uc = (size_t)l.upper_char();
      return lc ^ (uc << 16) ^ (uc >> 16);
    }
  };

  // ordering function for labels (lower symbol first, then upper
  // symbol); used as the comparison of the label set of Alphabet
  struct label_cmp {
    bool operator() ( const Label l1, const Label l2 ) const {
      if (l1.lower_char() != l2.lower_char())
	return l1.lower_char() < l2.lower_char();
      return l1.upper_char() < l2.upper_char();
    }
  };

  // comparison operator needed to store labels in a hash table
  struct label_eq {
    bool operator() ( const Label l1, const Label l2 ) const {
      if (l1.lower_char() != l2.lower_char())
	return false;
      return l1.upper_char() == l2.upper_char();
    }
  };
};
typedef std::vector<Label> Analysis;
/***************** class Alphabet *******************************/
// An Alphabet maps symbol strings to numeric symbol codes and back,
// and stores the set of labels (symbol pairs) known to a transducer.
class Alphabet {
// string equality predicate needed to store strings in a hash table
// (compares the string contents with strcmp, not the pointers)
struct eqstr {
bool operator()(const char* s1, const char* s2) const {
return strcmp(s1, s2) == 0;
}
};
// data structure storing labels without repetitions (i.e. as a set),
// ordered by Label::label_cmp
typedef std::set<Label, Label::label_cmp> LabelSet;
// hash table used to map the symbols to their codes
typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
public: // HFST addition
// hash table used to map the codes back to the symbols
typedef hash_map<Character, char*> CharMap;
// HFST addition
// equality test for alphabets (defined outside this header)
bool operator==(const Alphabet &alpha) const;
private:
SymbolMap sm; // maps symbols to codes
CharMap cm; // maps codes to symbols
LabelSet ls; // set of labels known to the alphabet
// add a new symbol with symbol code c
void add( const char *symbol, Character c );
public:
// NOTE(review): presumably selects UTF-8 handling of the scanned strings;
// the flag is not used in this header - confirm in alphabet.C
bool utf8;
// iterators over the set of known labels
typedef LabelSet::iterator iterator;
typedef LabelSet::const_iterator const_iterator;
Alphabet();
// the destructor releases the alphabet's contents by calling clear()
~Alphabet() { clear(); };
// begin(), end() and size() refer to the set of labels (ls),
// not to the symbol tables
const_iterator begin() const { return ls.begin(); };
const_iterator end() const { return ls.end(); };
size_t size() const { return ls.size(); };
// HFST additions
// returns a copy (by value) of the code-to-symbol table
CharMap get_char_map(void) { return cm; };
void print(void);
void clear();
// removes all labels; the symbol tables sm and cm are left untouched
void clear_char_pairs() { ls.clear(); };
// lookup a label in the alphabet
iterator find( Label l ) { return ls.find(l); };
// insert a label in the alphabet (the epsilon label is never stored)
void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
// insert the known symbols from another alphabet
void insert_symbols( const Alphabet& );
// insert the labels and known symbols from another alphabet
void copy( const Alphabet& );
// create the alphabet of a transducer obtained by a composition operation
void compose( const Alphabet &la, const Alphabet &ua );
// add a symbol to the alphabet and return its code
Character add_symbol(const char *symbol);
// add a symbol to the alphabet with a given code
void add_symbol(const char *symbol, Character c );
// create a new marker symbol and return its code
Character new_marker( void );
void delete_markers();
// compute the complement of a symbol set
void complement( std::vector<Character> &sym );
// return the code of the argument symbol,
// or EOF if the symbol is not in the symbol table
int symbol2code( const char *s ) const {
SymbolMap::const_iterator p = sm.find(s);
if (p != sm.end()) return p->second;
return EOF;
};
// return the symbol for the given symbol code,
// or NULL if the code is not in the table
const char *code2symbol( Character c ) const {
CharMap::const_iterator p=cm.find(c);
if (p == cm.end())
return NULL;
else
return p->second;
};
// write the symbol for the given symbol code into a string
void write_char( Character c, char *buffer, int *pos,
bool with_brackets=true ) const;
// write the symbol pair of a given label into a string
void write_label( Label l, char *buffer, int *pos,
bool with_brackets=true ) const;
// write the symbol for the given symbol code into a buffer and return
// a pointer to it
// the flag "with_brackets" indicates whether the angle brackets
// surrounding multi-character symbols are to be printed or not
const char *write_char( Character c, bool with_brackets=true ) const;
// write the symbol pair of a given label into a string
// and return a pointer to it
const char *write_label( Label l, bool with_brackets=true ) const;
// scan the next multi-character symbol in the argument string
int next_mcsym( char*&, bool insert=true );
// scan the next symbol in the argument string
int next_code( char*&, bool extended=true, bool insert=true );
// convert a character string into a symbol or label sequence
void string2symseq( char*, std::vector<Character>& );
void string2labelseq( char*, std::vector<Label>& );
// scan the next label in the argument string
Label next_label( char*&, bool extended=true );
// store the alphabet in the argument file (in binary form)
void store( FILE* ) const;
// read the alphabet from the argument file
void read( FILE* );
// disambiguation and printing of analyses
int compute_score( Analysis &ana );
void disambiguate( std::vector<Analysis> &analyses );
char *print_analysis( Analysis &ana, bool both_layers );
friend std::ostream &operator<<(std::ostream&, const Alphabet&);
};
// write the alphabet to the output stream (in readable form)
std::ostream &operator<<(std::ostream&, const Alphabet&);
}
#endif
/*******************************************************************/
/* */
/* FILE basic.C */
/* MODULE basic */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE */
/* */
/*******************************************************************/
#include <stdlib.h>
#include <string.h>
#include "basic.h"
namespace SFST {
// when true, read_num() reverses the byte order of every number it reads
bool Switch_Bytes=false;
/*******************************************************************/
/* */
/* fst_strdup */
/* */
/*******************************************************************/
// Returns a malloc'ed copy of the NUL-terminated string pString.
// If the allocation fails, an error message is printed on stderr and
// the program is terminated with exit status 1.
char* fst_strdup(const char* pString)
{
  // number of bytes to copy, including the terminating NUL
  const size_t nbytes = strlen(pString) + 1;

  char* copy = static_cast<char*>(malloc(nbytes));
  if (!copy) {
    fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
    exit(1);
  }

  memcpy(copy, pString, nbytes);
  return copy;
}
/*******************************************************************/
/* */
/* read_string */
/* */
/*******************************************************************/
// Reads a NUL-terminated string from "file" into "buffer", which has room
// for "size" characters.
//
// Reading stops at the first NUL byte or at end of file; in both cases the
// buffer is NUL-terminated. If "size" characters were read without finding
// a terminator, the last character read is overwritten with NUL (the string
// is truncated).
//
// Returns 1 if the string was terminated by a NUL byte in the file and
// 0 otherwise (end of file, truncation, or a non-positive buffer size).
int read_string( char *buffer, int size, FILE *file )
{
  // a buffer without any room cannot even hold the terminator;
  // writing buffer[size-1] below would be out of bounds
  if (size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }
  // buffer exhausted: truncate the string
  buffer[size-1] = 0;
  return 0;
}
/*******************************************************************/
/* */
/* read_num */
/* */
/*******************************************************************/
// Reads n bytes from "file" into the memory at p and, if Switch_Bytes is
// set, reverses the order of these n bytes in place.
// Returns the number of bytes actually read (the result of fread).
size_t read_num( void *p, size_t n, FILE *file )
{
  char *bytes = static_cast<char*>(p);
  const size_t nread = fread( bytes, 1, n, file );

  if (Switch_Bytes) {
    // swap the outermost bytes and move inwards until the indices meet
    size_t lo = 0;
    size_t hi = n;
    while (lo + 1 < hi) {
      --hi;
      const char saved = bytes[lo];
      bytes[lo] = bytes[hi];
      bytes[hi] = saved;
      ++lo;
    }
  }

  return nread;
}
}
/*******************************************************************/
/* */
/* FILE basic.h */
/* MODULE basic */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE */
/* */
/*******************************************************************/
#ifndef _BASIC_H_
#define _BASIC_H_
#include <stdio.h>
namespace SFST {
// when true, read_num() reverses the byte order of the data it reads
extern bool Switch_Bytes;
// returns a malloc'ed copy of pString; terminates the program with an
// error message if the memory allocation fails
char* fst_strdup(const char* pString);
// reads a string from file into buffer (of the given size) up to the next
// NUL byte or end of file; returns 1 if a NUL byte terminated the string
// and 0 otherwise
int read_string( char *buffer, int size, FILE *file );
// reads size bytes from file into p (byte-swapped if Switch_Bytes is set)
// and returns the number of bytes read
size_t read_num( void *p, size_t size, FILE *file );
}
#endif
/*******************************************************************/
/* */
/* FILE compact.C */
/* MODULE compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE Code needed for analysing data */
/* */
/*******************************************************************/
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include "compact.h"
namespace SFST {
using std::equal_range;
using std::vector;
using std::pair;
class label_less {
public:
bool operator()(const Label l1, const Label l2) const {
return l1.upper_char() < l2.upper_char();
}
};
// NOTE(review): this constant is not referenced by any function of
// compact.C visible here - confirm whether it is still needed
const int BUFFER_SIZE=1000;
/*******************************************************************/
/* */
/* CompactTransducer::convert */
/* */
/*******************************************************************/
// Translates an analysis given as a sequence of arc numbers ("cana")
// into the corresponding sequence of arc labels ("ana").
// Any previous content of "ana" is replaced.
void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )
{
  const size_t length = cana.size();
  ana.resize(length);
  for( size_t pos=0; pos!=length; ++pos ) {
    const unsigned int arc = cana[pos];
    ana[pos] = label[arc];
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::analyze */
/* */
/*******************************************************************/
void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
size_t ipos, CAnalysis &ca,
vector<CAnalysis> &analyses )
{
// "n" is the number of the current transducer node/state
// "input" is the sequence of input symbols
// "ipos" is the input position currently analysed
// "ca" stores the incomplete analysis string
// "analyses" stores the analyses found so far
if (analyses.size() > 10000)
return; // limit the maximal number of analyses
// Is the input string fully analyzed and the current node a final node?
if (finalp[n] && ipos == input.size())
// store the new analysis
analyses.push_back(ca);
// follow the epsilon transitions
// first_arc[n] is the number of the first outgoing transition of node n
// first_arc[n+1]-1 is the number of the last outgoing transition of node n
// first_arc[n+1] is the number of the first outgoing transition of node n+1
unsigned int i;
for( i=first_arc[n];
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
i++)
{
ca.push_back(i);
analyze(target_node[i], input, ipos, ca, analyses);
ca.pop_back();
}
// follow the non-epsilon transitions
// scan the next input symbol
if (ipos < input.size()) {
// find the set of arcs with matching upper character in the sorted list
pair<Label*,Label*>range =
equal_range(label+i, label+first_arc[n+1], Label(input[ipos]),
label_less());
unsigned int to = (unsigned int)(range.second - label);
// follow the non-epsilon transitions
for( i=(unsigned)(range.first-label); i<to; i++) {
ca.push_back(i);
analyze(target_node[i], input, ipos+1, ca, analyses);
ca.pop_back();
}
}
}
/*******************************************************************/
/* */
/* CompactTransducer::analyze_string */
/* */
/*******************************************************************/
void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
{
// "s" input string to be analyzed
// "analyses" is the data structure in which the results are stored
// and returned
vector<Character> input;
alphabet.string2symseq( s, input );
analyses.clear();
CAnalysis ca; // data structure where the current incomplete analysis
// is stored
analyze(0, input, 0, ca, analyses); // start the analysis
if (analyses.size() > 10000)
fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
if (simplest_only && analyses.size() > 1)
disambiguate( analyses ); // select the simplest analyses
}
/*******************************************************************/
/* */
/* CompactTransducer::~CompactTransducer */
/* */
/*******************************************************************/
// Releases all arrays owned by the transducer.
CompactTransducer::~CompactTransducer()
{
  // stochastic parameters
  delete[] arc_logprob;
  delete[] final_logprob;

  // per-arc data
  delete[] target_node;
  delete[] label;

  // per-node data
  delete[] first_arc;
  delete[] finalp;
}
/*******************************************************************/
/* */
/* CompactTransducer::CompactTransducer */
/* */
/*******************************************************************/
// Dummy constructor: creates a transducer without nodes, arcs and
// probabilities, and with both option flags switched off.
CompactTransducer::CompactTransducer()
{
  // option flags
  simplest_only = false;
  both_layers = false;

  // node data
  number_of_nodes = 0;
  first_arc = NULL;
  finalp = NULL;

  // arc data
  number_of_arcs = 0;
  target_node = NULL;
  label = NULL;

  // stochastic parameters
  final_logprob = (float*)NULL;
  arc_logprob = (float*)NULL;
}
/*******************************************************************/
/* */
/* CompactTransducer::read_finalp */
/* */
/*******************************************************************/
// Reads the final-state flags of all nodes from "file".
// The flags are stored as a bit vector: each byte holds the flags of
// eight consecutive nodes, the most significant bit coming first.
void CompactTransducer::read_finalp( FILE *file )
{
  unsigned char byte=0; // the byte currently being unpacked
  int bits_left=0;      // number of bits of "byte" not yet consumed

  for( size_t node=0; node<number_of_nodes; ++node ) {
    // fetch the next byte when the current one is used up
    if (bits_left == 0) {
      byte = (unsigned char)fgetc(file);
      bits_left = 8;
    }
    --bits_left;
    finalp[node] = (char)((byte >> bits_left) & 1);
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::read_first_arcs */
/* */
/*******************************************************************/
// Reads the table first_arc[0..number_of_nodes] from "file".
// Each entry is stored with "bits" bits, where bits is the rounded-up
// base-2 logarithm of number_of_arcs+1. The entries are packed, most
// significant bit first, into unsigned int words read with read_num().
// NOTE(review): if bits is 0, the shift by sizeof(n)*8 below is undefined
// behaviour in C++ - confirm that this case cannot occur in practice.
void CompactTransducer::read_first_arcs( FILE *file )
{
// k is the number of not yet consumed bits in the word n;
// the unconsumed bits are always kept at the top of n
int k=0;
unsigned int n=0;
int bits=(int)ceil(log(number_of_arcs+1)/log(2));
for( size_t i=0; i<=number_of_nodes; i++ ) {
// take the next "bits" bits from the top of the current word
first_arc[i] = n >> (sizeof(n)*8 - bits);
n <<= bits;
k -= bits;
if (k < 0) {
// the word held less than "bits" bits: the missing -k bits are the
// topmost bits of the next word
read_num(&n,sizeof(n),file);
first_arc[i] |= n >> (sizeof(n)*8 + k);
n <<= -k;
k += (int)sizeof(n) * 8;
}
}
}
/*******************************************************************/
/* */
/* CompactTransducer::read_target_nodes */
/* */
/*******************************************************************/
// Reads the table target_node[0..number_of_arcs-1] from "file".
// Each entry is stored with "bits" bits, where bits is the rounded-up
// base-2 logarithm of number_of_nodes. The entries are packed, most
// significant bit first, into unsigned int words read with read_num().
// NOTE(review): if bits is 0 (a single node), the shift by sizeof(n)*8
// below is undefined behaviour in C++ - confirm how this case is handled.
void CompactTransducer::read_target_nodes( FILE *file )
{
// k is the number of not yet consumed bits in the word n;
// the unconsumed bits are always kept at the top of n
int k=0;
unsigned int n=0;
int bits=(int)ceil(log(number_of_nodes)/log(2));
for( size_t i=0; i<number_of_arcs; i++ ) {
// take the next "bits" bits from the top of the current word
target_node[i] = n >> (sizeof(n)*8 - bits);
n <<= bits;
k -= bits;
if (k < 0) {
// the word held less than "bits" bits: the missing -k bits are the
// topmost bits of the next word
read_num(&n,sizeof(n),file);
target_node[i] |= n >> (sizeof(n)*8 + k);
n <<= -k;
k += (int)sizeof(n) * 8;
}
}
}
/*******************************************************************/
/* */
/* CompactTransducer::read_labels */
/* */
/*******************************************************************/
void CompactTransducer::read_labels( FILE *file )
{
size_t N=0;
Label Num2Label[alphabet.size()];
for( Alphabet::const_iterator it=alphabet.begin();
it != alphabet.end(); it++ )
{
Label l=*it;
Num2Label[N++] = l;
}
int k=0;
unsigned int n=0;
int bits=(int)ceil(log((double)alphabet.size())/log(2));
for( size_t i=0; i<number_of_arcs; i++ ) {
unsigned int l = n >> (sizeof(n)*8 - bits);
n <<= bits;
k -= bits;
if (k < 0) {
read_num(&n,sizeof(n),file);
l |= n >> (sizeof(n)*8 + k);
n <<= -k;
k += (int)sizeof(n) * 8;
}
label[i] = Num2Label[l];
}
}
/*******************************************************************/
/* */
/* CompactTransducer::read_probs */
/* */
/*******************************************************************/
// Reads the stochastic parameters of the transducer from "file".
// The file starts with the number of nodes and the number of arcs (both
// stored as size_t), followed by one float per node (final_logprob) and
// one float per arc (arc_logprob).
// The program is terminated with an error message if the counts do not
// match the transducer or if the file is too short.
void CompactTransducer::read_probs( FILE *file )
{
  size_t n,m;

  // both counts must be present and must agree with the transducer
  if (fread(&n, sizeof(n), 1, file) != 1 ||
      fread(&m, sizeof(m), 1, file) != 1 ||
      n != node_count() || m != arc_count())
    {
      fprintf(stderr,"Error: incompatible probability file!\n");
      exit(1);
    }

  final_logprob = new float[n];
  arc_logprob = new float[m];

  // n values for the nodes and m values for the arcs
  // (arc_logprob has m elements and is indexed by arc numbers)
  if (fread(final_logprob, sizeof(float), n, file) != n ||
      fread(arc_logprob, sizeof(float), m, file) != m) {
    fprintf(stderr,"Error: in probability file!\n");
    exit(1);
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::CompactTransducer */
/* */
/*******************************************************************/
// Reads a compact transducer from "file" and, if "pfile" is not NULL,
// its stochastic parameters from "pfile".
//
// Throws a C string if "file" does not start with the format marker 'c'.
// If the header (the node and arc counts) cannot be read completely or a
// read error occurred, the transducer is left empty (no nodes, no arcs).
CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )
{
  both_layers = false;
  simplest_only = false;

  // start from a well-defined empty state so that the destructor never
  // sees uninitialised pointers, whatever happens while reading
  number_of_nodes = 0;
  number_of_arcs = 0;
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  arc_logprob = final_logprob = (float*)NULL;

  if (fgetc(file) != 'c')
    throw "Error: wrong file format (not a compact transducer)\n";
  alphabet.read(file);

  // both counts must have been read completely
  const bool nodes_read =
    (read_num(&number_of_nodes,sizeof(number_of_nodes),file) ==
     sizeof(number_of_nodes));
  const bool arcs_read =
    (read_num(&number_of_arcs,sizeof(number_of_arcs),file) ==
     sizeof(number_of_arcs));

  if (nodes_read && arcs_read && !ferror(file)) {
    // memory allocation
    finalp = new char[number_of_nodes];
    first_arc = new unsigned[number_of_nodes+1];
    label = new Label[number_of_arcs];
    target_node = new unsigned[number_of_arcs];
    // reading the data
    read_finalp(file);
    read_first_arcs(file);
    read_labels(file);
    read_target_nodes(file);
  }
  else {
    // no arrays were allocated: the counts must not claim otherwise
    number_of_nodes = 0;
    number_of_arcs = 0;
  }

  if (pfile != NULL)
    read_probs(pfile);
}
/*******************************************************************/
/* */
/* CompactTransducer::longest_match2 */
/* */
/*******************************************************************/
void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
CAnalysis &ca, int &bl, CAnalysis &ba)
{
// n: transducer state
// string: rest string
// l: length of current analysis
// bl: length of the currently longest match
// ca: current analysis
// ba: best analysis
if (finalp[n] && l > bl) {
// store the new analysis
bl = l;
ba = ca; // copy the arc vector
}
// follow the epsilon transitions
unsigned int i;
for( i=first_arc[n];
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
i++)
{
ca.push_back(i);
longest_match2(target_node[i], string, l, ca, bl, ba);
ca.pop_back();
}
// follow the non-epsilon transitions
char *end=string;
int c=alphabet.next_code(end, false, false);
l += (int)(end - string);
if (c != EOF) {
// find the set of arcs with matching upper character in the sort list
pair<Label*,Label*>range =
equal_range(label+i, label+first_arc[n+1], Label((Character)c),
label_less());
unsigned int to = (unsigned int)(range.second - label);
for( i=(unsigned)(range.first-label); i<to; i++) {
ca.push_back(i);
longest_match2(target_node[i], end, l, ca, bl, ba);
ca.pop_back();
}
}
}
/*******************************************************************/
/* */
/* CompactTransducer::print_analysis */
/* */
/*******************************************************************/
// Returns the printed form of an analysis given as a sequence of arc
// numbers. The arcs are converted to labels and printed by the alphabet;
// "both_layers" decides whether both symbols of each label are printed.
char *CompactTransducer::print_analysis( CAnalysis &cana )
{
  Analysis labels;
  convert(cana, labels);
  char *text = alphabet.print_analysis( labels, both_layers );
  return text;
}
/*******************************************************************/
/* */
/* CompactTransducer::longest_match */
/* */
/*******************************************************************/
// Longest-match analysis: finds the longest prefix of "string" which is
// accepted by the transducer, advances "string" past this prefix and
// returns the printed analysis of the prefix.
// If no prefix matches, the next symbol of "string" is scanned with
// alphabet.next_code (which advances "string") and the symbol string of
// its code, as returned by alphabet.code2symbol, is the result.
const char *CompactTransducer::longest_match( char* &string )
{
  CAnalysis ca, ba; // current and best analysis
  int l=0;          // length of the longest match

  longest_match2(0, string, 0, ca, l, ba);

  // no match? return the next character
  if (ba.size() == 0) {
    int c=alphabet.next_code(string, false, false);
    return alphabet.code2symbol((Character)c);
  }

  string += l;
  return print_analysis( ba );
}
/*******************************************************************/
/* */
/* CompactTransducer::disambiguate */
/* */
/*******************************************************************/
// Keeps only those analyses whose score (as computed by
// alphabet.compute_score) is maximal; their relative order is preserved.
void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )
{
  const size_t total = analyses.size();

  // compute the score of each analysis and remember the best score
  vector<int> scores;
  scores.reserve(total);
  int best = INT_MIN;
  Analysis labels;
  for( size_t a=0; a<total; ++a ) {
    convert(analyses[a], labels);
    const int value = alphabet.compute_score(labels);
    scores.push_back(value);
    if (value > best)
      best = value;
  }

  // move the optimal analyses to the front and cut off the rest
  size_t kept = 0;
  for( size_t a=0; a<total; ++a ) {
    if (scores[a] != best)
      continue;
    analyses[kept] = analyses[a];
    ++kept;
  }
  analyses.resize(kept);
}
/*******************************************************************/
/* */
/* CompactTransducer::train2 */
/* */
/*******************************************************************/
// Training on a string of symbol pairs: "s" is converted into a label
// sequence which is followed through the transducer starting at node 0,
// taking for each label the first outgoing arc with exactly this label.
// On success the frequencies of the arcs used and of the final node
// reached are incremented and true is returned.
// If the input is not covered by the transducer, a warning is printed
// and false is returned.
bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
                                vector<double> &finalfreq )
{
  vector<Label> input;
  alphabet.string2labelseq( s, input );

  CAnalysis ca; // data structure where the analysis is stored
  unsigned int n=0; // the current node
  bool failure=false;

  for( size_t i=0; i<input.size(); i++ ) {
    failure = true;
    for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
      if (label[k] == input[i]) {
        ca.push_back(k);
        n = target_node[k];
        failure = false;
        break;
      }
    }
    if (failure)
      break;
  }

  if (failure || !finalp[n]) {
    fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
    return false;
  }

  for( size_t k=0; k<ca.size(); k++ )
    arcfreq[ca[k]]++;
  // n is the node reached at the end of the input. It is used instead of
  // target_node[ca.back()] because "ca" is empty for an empty input
  // accepted by a final start node.
  finalfreq[n]++;
  return true;
}
/*******************************************************************/
/* */
/* CompactTransducer::train */
/* */
/*******************************************************************/
// Training on a surface string: "s" is analysed and the frequency mass 1 is
// distributed uniformly over its analyses (or over the simplest analyses
// if "simplest_only" is set), incrementing the frequencies of their arcs
// and final nodes.
// Returns false if "s" has no analysis and true otherwise. Inputs with
// more than 10000 analyses are ignored (but true is returned).
bool CompactTransducer::train( char *s, vector<double> &arcfreq,
                               vector<double> &finalfreq )
{
  vector<CAnalysis> analyses;
  vector<Character> input;
  alphabet.string2symseq( s, input );

  CAnalysis ca; // data structure where the current incomplete analysis
                // is stored
  analyze(0, input, 0, ca, analyses); // start the analysis

  if (analyses.size() > 10000)
    return true; // ignore inputs with more than 10000 analyses
  else if (analyses.size() == 0)
    return false;

  if (simplest_only && analyses.size() > 1)
    disambiguate( analyses ); // select the simplest analyses

  if (analyses.size() > 0) {
    double incr = 1.0 / (double)analyses.size();
    for( size_t i=0; i<analyses.size(); i++ ) {
      CAnalysis &arcs=analyses[i];
      for( size_t k=0; k<arcs.size(); k++ )
        arcfreq[arcs[k]] += incr;
      // An analysis without arcs (empty input accepted by a final start
      // node) ends in the start node 0; calling back() on it would be
      // undefined behaviour.
      const unsigned int final_node =
        arcs.empty() ? 0u : target_node[arcs.back()];
      finalfreq[final_node] += incr;
    }
  }
  return true;
}
/*******************************************************************/
/* */
/* CompactTransducer::estimate_probs */
/* */
/*******************************************************************/
// Turns the frequencies in "arcfreq" and "finalfreq" into probabilities:
// for each node, the final frequency of the node and the frequencies of
// its outgoing arcs are divided by their sum. Nodes whose frequencies are
// all zero are left unchanged (the divisor is set to 1).
void CompactTransducer::estimate_probs( vector<double> &arcfreq,
                                        vector<double> &finalfreq )
{
  const size_t nodes = finalfreq.size();
  for( size_t node=0; node<nodes; ++node ) {
    // the outgoing arcs of the node
    const size_t arcs_begin = first_arc[node];
    const size_t arcs_end = first_arc[node+1];

    // total frequency mass of the node
    double total = finalfreq[node];
    for( size_t arc=arcs_begin; arc<arcs_end; ++arc )
      total += arcfreq[arc];
    if (total == 0.0)
      total = 1.0;

    // normalisation
    finalfreq[node] /= total;
    for( size_t arc=arcs_begin; arc<arcs_end; ++arc )
      arcfreq[arc] /= total;
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::compute_probs */
/* */
/*******************************************************************/
// Computes the probability of each analysis from the log probabilities of
// its arcs and of its final node, sorts the analyses by decreasing
// probability and stores the probabilities, normalised by their sum,
// in "prob" (prob[i] belongs to analyses[i] after the call).
void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
                                       vector<double> &prob )
{
  prob.resize(analyses.size());
  double sum=0.0;
  for( size_t i=0; i<analyses.size(); i++ ) {
    CAnalysis &a=analyses[i];
    // compute the probability
    double logprob=0.0;
    for( size_t k=0; k<a.size(); k++ )
      logprob += arc_logprob[a[k]];
    // An analysis without arcs (empty input accepted by a final start
    // node) ends in the start node 0; calling back() on it would be
    // undefined behaviour.
    const unsigned int final_node = a.empty() ? 0u : target_node[a.back()];
    logprob += final_logprob[final_node];
    prob[i] = exp(logprob);
    sum += prob[i];
  }

  // sort the analyses: repeatedly select the most probable remaining
  // analysis (the first one in case of ties) from a copy of the input
  vector<CAnalysis> oldanalyses(analyses);
  vector<double> oldprob(prob);
  for( size_t i=0; i<analyses.size(); i++ ) {
    prob[i] = -1.0;
    size_t n=0;
    for( size_t k=0; k<oldanalyses.size(); k++ )
      if (prob[i] < oldprob[k]) {
        prob[i] = oldprob[k];
        n = k;
      }
    analyses[i] = oldanalyses[n];
    oldprob[n] = -1.0; // mark the analysis as used
    prob[i] /= sum; // normalization
  }
}
}
/*******************************************************************/
/* */
/* FILE compact.h */
/* MODULE compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE finite state tools */
/* */
/*******************************************************************/
#ifndef _COMPACT_H_
#define _COMPACT_H_
#include "alphabet.h"
#include <vector>
namespace SFST {
typedef std::vector<unsigned int> CAnalysis;
// A CompactTransducer is a read-only transducer stored in a few flat arrays.
// The outgoing arcs of node i are the arcs first_arc[i] ... first_arc[i+1]-1.
class CompactTransducer {
protected:
// the following data structures are used to store the nodes
unsigned int number_of_nodes; // number of nodes in the transducer
char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
unsigned int *first_arc; // first_arc[i] is the number of the first
// arc outgoing from node i
// (the array has number_of_nodes+1 elements)
// the following data structures are used to store the transition arcs
unsigned int number_of_arcs; // total number of arcs in the transducer
Label *label; // the label (character pair) of arc i
unsigned int *target_node; // target node of arc i
// the following data structures are used to store the stochastic parameters
// (both are NULL if no probability file was read)
float *final_logprob; // one value per node
float *arc_logprob; // one value per arc
// functions needed to read the transducer from a file
void read_finalp( FILE *file );
void read_first_arcs( FILE *file );
void read_target_nodes( FILE *file );
void read_labels( FILE *file );
void read_probs( FILE *file );
// functions needed to analyze data with the transducer
void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
CAnalysis&, std::vector<CAnalysis>&);
// function selecting the simplest morphological analysis
int compute_score( CAnalysis &ana );
void disambiguate( std::vector<CAnalysis> &analyses );
// functions for longest-match analysis of input data
void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
// converts a sequence of arc numbers into the sequence of their labels
void convert( CAnalysis &cana, Analysis &ana );
public:
size_t node_count() { return number_of_nodes; };
size_t arc_count() { return number_of_arcs; };
bool both_layers; // print surface and analysis symbols
bool simplest_only; // print only the simplest analyses
Alphabet alphabet; // data structure which maps symbols to numeric codes
CompactTransducer(); // dummy constructor
CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
~CompactTransducer(); // destroys a transducer
// the analysis function returns the set of analyses for the string "s"
// in the argument "analyses"
void analyze_string( char *s, std::vector<CAnalysis > &analyses );
// computes the normalised probabilities of the analyses and sorts the
// analyses by decreasing probability
void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
char *print_analysis( CAnalysis &ana );
// longest-match analysis
const char *longest_match( char*& );
// EM training
bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
// robust analysis
float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
float ErrorsAllowed );
};
}
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment