Commit 0b611765 authored by Ben Campbell
Browse files

Initial commit

parents
/*******************************************************************/
/* */
/* FILE generate.C */
/* MODULE generate */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE prints automata */
/* */
/*******************************************************************/
#include "fst.h"
using std::vector;
namespace SFST {
const int BUFFER_SIZE = 10000;
/*******************************************************************/
/* */
/* Transducer::generate1 */
/* */
/*******************************************************************/
// Recursive worker of generate(): prints the strings of all paths that start
// at `node` and end in a final node.
//
//   node        node at which the current path has arrived
//   visitations per-node visit counters, shared by the whole traversal
//   buffer      text collected so far (lower side if buffer2 is given)
//   pos         number of bytes used in buffer
//   buffer2     upper-side text, or NULL if both sides share `buffer`
//   pos2        number of bytes used in buffer2
//   file        output stream
//
// Returns true if at least one string was printed below `node`.
bool Transducer::generate1( Node *node, Node2Int &visitations, char *buffer,
                            int pos, char *buffer2, int pos2, FILE *file )
{
  // Give up on this path once either buffer is nearly full; the margin of
  // 100 leaves room for the symbol written by the next step.
  if (pos > BUFFER_SIZE - 100 || pos2 > BUFFER_SIZE - 100)
    return false;

  bool printed = false;
  if (node->is_final()) {
    buffer[pos] = '\n';
    buffer[pos+1] = 0;
    if (buffer2) {
      // separate mode: upper string, a tab, then the lower string
      buffer2[pos2] = '\t';
      buffer2[pos2+1] = 0;
      fputs(buffer2, file);
    }
    fputs(buffer, file);
    printed = true;
  }
  visitations[node]++;

  // Order the outgoing arcs by the visit count of their target node
  // (ascending); arcs with equal counts keep their original order.
  vector<Arc*> ordered;
  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *current = it;
    Node *target = current->target_node();
    size_t slot = 0;
    while (slot < ordered.size() &&
           !(visitations[target] < visitations[ordered[slot]->target_node()]))
      slot++;
    ordered.insert( ordered.begin() + slot, current );
  }

  for( size_t k=0; k<ordered.size(); k++ ) {
    Arc *current = ordered[k];
    int next_pos = pos;
    int next_pos2 = pos2;
    Label l = current->label();
    if (!buffer2)
      alphabet.write_label(l, buffer, &next_pos );
    else {
      // epsilon symbols are not written in separate mode
      Character lc = l.lower_char();
      Character uc = l.upper_char();
      if (lc != Label::epsilon)
        alphabet.write_char(lc, buffer, &next_pos );
      if (uc != Label::epsilon)
        alphabet.write_char(uc, buffer2, &next_pos2 );
    }
    if (generate1( current->target_node(), visitations,
                   buffer, next_pos, buffer2, next_pos2, file))
      printed = true;
  }
  return printed;
}
/*******************************************************************/
/* */
/* Transducer::generate */
/* */
/*******************************************************************/
// Prints the strings generated by the transducer to `file`, starting at the
// root node. With `separate` set, the two sides of each path are collected in
// two buffers and printed as "upper<TAB>lower"; otherwise a single buffer of
// labels is used. Returns true if anything was printed.
bool Transducer::generate( FILE *file, bool separate )
{
  char buffer[BUFFER_SIZE];
  Node2Int visitations;

  if (!separate)
    return generate1( root_node(), visitations, buffer, 0, NULL, 0, file );

  char buffer2[BUFFER_SIZE];
  return generate1( root_node(), visitations, buffer, 0, buffer2, 0, file );
}
}
/*******************************************************************/
/* */
/* FILE hopcroft.C */
/* MODULE hopcroft */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/*******************************************************************/
#include "fst.h"
// #define TEST1
// HFST
namespace SFST
{
/***************** class Transition ******************************/
class Transition {
  // A single transition (source state ID, label, target state ID) as used
  // by the minimisation code.
public:
  size_t source;
  Label label;
  size_t target;

  Transition( size_t s, Label l, size_t t )
    : source(s), label(l), target(t) {}

  // Two transitions are equal if all three components are equal.
  bool operator==( const Transition &t ) const {
    if (!(t.source == source))
      return false;
    if (!(t.label == label))
      return false;
    return t.target == target;
  }

  // Ordering by target first, then label, then source, so that sorting
  // groups the transitions by the state they lead to.
  bool operator<( const Transition &t ) const {
    if (target != t.target)
      return target < t.target;
    if (label < t.label) return true;
    if (label > t.label) return false;
    return source < t.source;
  }
};
/***************** class TransitionTable *************************/
// TransitionTable quickly retrieves the transitions to a given state
class TransitionTable {
  // Transducer representation used by the minimiser: all transitions in one
  // vector, sorted by target state, plus an index of where each target's
  // transitions begin.
public:
  NodeNumbering nodenumbering; // maps node pointers to IDs and vice versa
  vector<Transition> transition;
  vector<size_t> startpos; // position of the first transition
                           // with a given target state; the transitions
                           // into state s are [startpos[s], startpos[s+1])

  TransitionTable( Transducer &t ) : nodenumbering(t)
  {
    t.build_transtab( transition, nodenumbering );
    std::sort( transition.begin(), transition.end() );

    // build the startpos array
    startpos.push_back(0);
    for( size_t i=0; i<transition.size(); i++ )
      // A new target state starts here; states skipped in between have no
      // incoming transitions and get an empty range.
      while (startpos.size() <= transition[i].target)
        startpos.push_back(i);
    // end marker for the last target state seen
    startpos.push_back( transition.size() );
    // States numbered above the largest target have no incoming transitions
    // either. Give them empty ranges too so that startpos[s] and
    // startpos[s+1] are valid for every state ID s.
    while (startpos.size() <= nodenumbering.number_of_nodes())
      startpos.push_back( transition.size() );
  }
};
/************* class PosRange **********************************/
class PosRange {
  // Half-open index range [from, to): position of the first element and
  // of the element after the last one of a state set.
public:
  size_t from;
  size_t to;

  PosRange( size_t f, size_t t ) : from(f), to(t) {}

  // number of positions covered by the range
  size_t size() {
    const size_t length = to - from;
    return length;
  }
};
/************* class Stack *************************************/
class Stack {
  // LIFO to-do list of state set IDs in which every ID occurs at most once
private:
  vector<size_t> setID; // IDs in insertion order, needed for LIFO retrieval
  set<size_t> store;    // the same IDs, needed for quick membership tests

public:
  // Adds `id` on top unless it is already stored.
  void push( size_t id ) {
    if (store.insert(id).second)
      setID.push_back( id );
  }

  bool is_empty() { return setID.empty(); }

  // Removes and returns the most recently pushed ID.
  // Must not be called on an empty stack.
  size_t pop(void) {
    const size_t top = setID.back();
    setID.pop_back();
    store.erase(top);
    return top;
  }

  bool contains( size_t id ) {
    return store.count(id) != 0;
  }
};
/************* class Minimiser *********************************/
// table which maps labels to sets of source states
typedef map<Label,set<size_t> > Label2SState;
class Minimiser {
// main class of the minimisation algorithm
Transducer &transducer; // reference to the original transducer
TransitionTable transtab; // internal transducer representation
vector<size_t> partition; // state partition table
// states from the same set occur next to each other in the table
vector<PosRange> posrange; // start+end positions of sets in "partition"
vector<size_t> state2set; // maps states to state sets
Stack stack; // stack of split states (to-do list)
void print_partition( FILE *file ) {
for( size_t i=0; i<posrange.size(); i++ ) {
fprintf(file,"set[%lu] =", (unsigned long)i);
for( size_t k=posrange[i].from; k<posrange[i].to; k++ )
fprintf(file," %lu", (unsigned long)partition[k]);
fputc('\n', file);
}
}
public:
Minimiser( Transducer &t ); // initialises the minimiser
Transducer &result(); // does the minimisation and returns the result
void compute_source_states( Label2SState&, PosRange ); // computes for each
// label l the set of source states with an "l" transition to a state
// in the current split state set
void split( size_t B, vector<size_t> &T1, size_t C ); // splits a state set
// by the state set given in "T1".
};
/*******************************************************************/
/* */
/* Minimiser::Minimiser */
/* */
/*******************************************************************/
// Builds the internal transition table for `t` and the initial partition:
// the final states form the first set (if there are any), the non-final
// states the next one (if there are any).
Minimiser::Minimiser( Transducer &t )
  : transducer(t), transtab(t)
{
  const size_t num_states = transtab.nodenumbering.number_of_nodes();

  // separate the final from the non-final states
  vector<size_t> final_states, other_states;
  for( size_t s=0; s<num_states; s++ ) {
    if (transtab.nodenumbering.get_node(s)->is_final())
      final_states.push_back(s);
    else
      other_states.push_back(s);
  }

  // build the initial partition of states
  state2set.resize( num_states );
  size_t set_id = 0;
  if (!final_states.empty()) {
    for( size_t k=0; k<final_states.size(); k++ ) {
      const size_t s = final_states[k];
      partition.push_back(s);
      state2set[s] = set_id;
    }
    posrange.push_back( PosRange(0, partition.size() ) );
    set_id++;
  }
  if (!other_states.empty()) {
    const size_t first = partition.size();
    for( size_t k=0; k<other_states.size(); k++ ) {
      const size_t s = other_states[k];
      partition.push_back(s);
      state2set[s] = set_id;
    }
    posrange.push_back( PosRange(first, partition.size() ) );
  }
}
/*******************************************************************/
/* */
/* Minimiser::compute_source_states */
/* */
/* compute the backward transitions from states in the split set */
/* */
/*******************************************************************/
// Collects the backward transitions of the split set: for every state stored
// at positions [r.from, r.to) of "partition" and every transition leading
// into it, the source state is added to l2ss[label of the transition].
void Minimiser::compute_source_states( Label2SState &l2ss, PosRange r )
{
  for( size_t pos=r.from; pos<r.to; pos++ ) {
    const size_t state = partition[pos];
    // the transitions into "state" are stored contiguously
    const size_t first = transtab.startpos[state];
    const size_t last = transtab.startpos[state+1];
    for( size_t k=first; k<last; k++ ) {
      const Transition &tr = transtab.transition[k];
      // most expensive code line follows here !!!
      l2ss[tr.label].insert(tr.source);
    }
  }
}
/*******************************************************************/
/* */
/* Minimiser::split */
/* */
/*******************************************************************/
// Splits state set B into the states that also occur in T1 (they stay in B)
// and the remaining states (they become a new set), and schedules work on
// the to-do stack.
//   B   ID of the state set to be split
//   T1  state IDs to intersect with; walked in parallel with the members
//       of B, so both are expected to be in ascending order
//   C   ID of the set currently used as splitter
// Nothing happens if B has a single member or if all members are in T1.
void Minimiser::split( size_t B, vector<size_t> &T1, size_t C )
{
// splits a state set by intersection with the state set "T1"
// get the position range of the states in the "partition" table
PosRange r = posrange[B];
if (r.size() == 1)
return;
vector<size_t> B1_set, B2_set;
#ifdef TEST
fprintf(stderr,"\nsplit %lu with", B);
for( size_t i=0; i<T1.size(); i++ )
fprintf(stderr," %lu", T1[i]);
fputc('\n', stderr);
#endif
// compute the intersection and the difference of the two sets
// (merge-style walk: i runs over the members of B, k over T1)
size_t k=0;
size_t i=r.from;
while (i < r.to) {
if (k == T1.size() || partition[i] < T1[k])
// state is not in the intersection
B2_set.push_back(partition[i++]);
else if (partition[i] == T1[k]) {
// state is in the intersection
B1_set.push_back(partition[i++]);
k++;
}
else
// T1[k] is smaller than the current member of B: skip it
k++;
}
#ifdef TEST
fprintf(stderr,"B1 = ");
for( size_t i=0; i<B1_set.size(); i++ )
fprintf(stderr," %lu", B1_set[i]);
fputc('\n', stderr);
fprintf(stderr,"B2 = ");
for( size_t i=0; i<B2_set.size(); i++ )
fprintf(stderr," %lu", B2_set[i]);
fputc('\n', stderr);
#endif
// every member of B is in T1: there is nothing to split off
if (B2_set.size() == 0)
return;
// split the set in "partition"
// copy the first subset to partition
k = r.from;
for( size_t i=0; i<B1_set.size(); i++ )
partition[k++] = B1_set[i];
// update the end position of the reduced state set
posrange[B].to = k;
// copy the second subset to partition
// (it is stored directly behind the first one and gets a new set ID)
size_t new_B = posrange.size();
posrange.push_back( PosRange(k, k+B2_set.size()) );
for( size_t i=0; i<B2_set.size(); i++ ) {
state2set[B2_set[i]] = new_B;
partition[k++] = B2_set[i];
}
// add the new set to the set of split states
if (B == C) {
// the splitter itself was split: both halves are scheduled
stack.push( B );
stack.push( new_B );
}
else if (stack.contains(B))
// B is already scheduled, so only the new half is added
stack.push( new_B );
else if (B1_set.size() > B2_set.size())
// otherwise only the smaller of the two halves is scheduled
stack.push( new_B );
else
stack.push( B );
#ifdef TEST
fprintf(stderr,"\nnew partition:\n");
print_partition( stderr );
#endif
return;
}
/*******************************************************************/
/* */
/* Minimiser::result */
/* */
/*******************************************************************/
// Runs the partition refinement until the to-do stack is empty and returns a
// newly allocated transducer with one node per remaining state set.
// A transducer with a single node is returned as a plain copy.
Transducer &Minimiser::result()
{
if (transtab.nodenumbering.number_of_nodes() == 1)
return transducer.copy(); // no need for a minimisation
// initialise the stack of split sets
if (posrange.size() == 2) {
// there are both, final and non-final states
// put the larger one first on the stack
// (the smaller one is then popped and processed first)
if (posrange[0].to-posrange[0].from > posrange[1].to-posrange[1].from) {
stack.push( 0 );
stack.push( 1 );
}
else {
stack.push( 1 );
stack.push( 0 );
}
}
else
stack.push( 0 );
#ifdef TEST
fprintf(stderr,"\nPartition:\n");
print_partition( stderr );
#endif
// repeat until the stack is empty
while (!stack.is_empty()) {
size_t C = stack.pop(); // next state set on which the others are split
// compute for each label l the set of source states with a transition
// to one of the states in the split set which is labelled with l
Label2SState l2ss;
compute_source_states( l2ss, posrange[C] );
// loop over the labels
for( Label2SState::iterator it=l2ss.begin(); it!= l2ss.end(); it++ ) {
set<size_t> &X = it->second;
// copy the set of source states to a sorted vector
// (iterating over the set yields the IDs in ascending order)
vector<size_t> T1;
T1.reserve(X.size());
for( set<size_t>::iterator it=X.begin(); it!=X.end(); it++ )
T1.push_back(*it);
// find the relevant source state sets
// source_set_counts[s] = number of states in T1 which belong to set s
map<size_t,size_t> source_set_counts;
for( size_t i=0; i<T1.size(); i++ ) {
size_t ss = state2set[T1[i]];
map<size_t,size_t>::iterator it = source_set_counts.find( ss );
if (it == source_set_counts.end())
source_set_counts[ ss ] = 1;
else
it->second++;
}
// loop over the source state sets
for( map<size_t,size_t>::iterator it=source_set_counts.begin();
it!=source_set_counts.end(); it++ )
{
size_t B = it->first;
// split B only if it has more members than the number of its states
// found in T1, i.e. if some member of B is missing from T1
//if (posrange[B].to - posrange[B].from < it->second)
if (posrange[B].to - posrange[B].from > it->second)
split( B, T1, C );
}
}
}
#ifdef TEST
fprintf(stderr,"\nPartition:\n");
print_partition( stderr );
#endif
// build the minimised transducer from the final partition
return *new Transducer( transducer, state2set, transtab.nodenumbering,
posrange.size() );
}
/*******************************************************************/
/* */
/* Transducer::build_TT */
/* */
/*******************************************************************/
// Recursive worker of build_transtab(): appends one Transition per arc
// reachable from `node` to `transtab`, using `nn` to turn nodes into IDs.
// Nodes for which was_visited(vmark) reports true are skipped.
void Transducer::build_TT( Node *node, vector<Transition> &transtab,
                           NodeNumbering &nn )
{
  if (node->was_visited( vmark ))
    return;

  const size_t sourceID = nn[node];
  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *arc = it;
    Node *target = arc->target_node();
    // the transitions below the target are added first
    build_TT( target, transtab, nn );
    const size_t targetID = nn[target];
    transtab.push_back( Transition( sourceID, arc->label(), targetID ) );
  }
}
/*******************************************************************/
/* */
/* Transducer::build_transtab */
/* */
/*******************************************************************/
// Fills `tt` with the transitions found by a traversal from the root node;
// node IDs are taken from `nn`. The visit mark is advanced first so that
// the traversal starts with a fresh mark.
void Transducer::build_transtab( vector<Transition> &tt, NodeNumbering &nn )
{
  incr_vmark();
  Node *start = root_node();
  build_TT( start, tt, nn );
}
/*******************************************************************/
/* */
/* Transducer::Transducer */
/* */
/* builds the result transducer after minimisation */
/* */
/*******************************************************************/
// Builds the result transducer after minimisation.
//   t                 transducer that was minimised
//   state2set         maps each state ID of t to the ID of its state set
//   nn                node <-> state ID mapping of t
//   num_of_new_nodes  number of state sets = number of nodes to build
// The set containing state 0 is mapped to the root node of the new
// transducer.
Transducer::Transducer( Transducer &t, vector<size_t> &state2set,
                        NodeNumbering &nn, size_t num_of_new_nodes )
{
  vmark = 0;
  minimised = true;
  deterministic = true;
  alphabet.copy(t.alphabet);

  // Choose a representative for each state set
  // (the member with the highest state ID, since later entries overwrite)
  vector<Node*> representative( num_of_new_nodes );
  for( size_t state=0; state<state2set.size(); state++ )
    representative[state2set[state]] = nn.get_node(state);

  // Create the new nodes; the root node exists already
  vector<Node*> node(num_of_new_nodes, NULL);
  node[state2set[0]] = root_node();
  for( size_t set_id=0; set_id<node.size(); set_id++ )
    if (node[set_id] == NULL)
      node[set_id] = new_node();

  // Add the transitions
  for( size_t set_id=0; set_id<node.size(); set_id++ ) {
    Node *src = representative[set_id];
    Node *dst = node[set_id];
    dst->set_final( src->is_final() );
    for( ArcsIter it(src->arcs()); it; it++ ) {
      Arc *arc = it;
      // Compute the target node: Map the node to its index,
      // the index to its set, and the set to the new node
      const size_t target_set = state2set[nn[arc->target_node()]];
      // Insert the transition
      dst->add_arc( arc->label(), node[target_set], this );
#ifdef TEST
      fprintf(stderr,"arc: %lu -%s-> %lu\n", set_id,
              alphabet.write_label(arc->label()),
              target_set);
#endif
    }
  }
}
/*******************************************************************/
/* */
/* Transducer::hopcroft_minimise */
/* */
/*******************************************************************/
// Returns a newly allocated minimised version of this transducer.
// If the transducer is already marked as minimised, a copy is returned.
// Otherwise it is reversed twice, determinised, and handed to the Minimiser;
// every intermediate transducer is deleted again.
// The parameter `verbose` is not used here.
Transducer &Transducer::hopcroft_minimise( bool verbose )
{
  if (minimised)
    return copy();

  Transducer *reversed = &reverse();
  Transducer *restored = &reversed->reverse();
  delete reversed;

  Transducer *determinised = &restored->determinise();
  delete restored;

  Transducer *result = &Minimiser( *determinised ).result();
  delete determinised;

  result->minimised = true;
  result->minimise_alphabet();
  return *result;
}
}
/*******************************************************************/
/* */
/* FILE interface.C */
/* MODULE interface */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/*******************************************************************/
#include <fstream>
#include <set>
#include "interface.h"
using std::ifstream;
using std::vector;
namespace SFST {
/*******************************************************************/
/* */
/* error */
/* */
/*******************************************************************/
// Prints "Error: <message>" followed by "aborted." to cerr and terminates
// the program with exit status 1.
void error( const char *message )
{
  cerr << "\nError: ";
  cerr << message;
  cerr << "\naborted.\n";
  exit(1);
}
/*******************************************************************/
/* */
/* error2 */
/* */
/*******************************************************************/
// Like error(), but prints "Error: <message>: <input>" before terminating
// the program with exit status 1.
void error2( const char *message, char *input )
{
  cerr << "\nError: " << message;
  cerr << ": " << input;
  cerr << "\naborted.\n";
  exit(1);
}
/*******************************************************************/
/* */
/* Interface::symbol_code */
/* */
/*******************************************************************/
// Returns the code of `symbol` in TheAlphabet; a symbol that is not known yet
// (symbol2code returns EOF) is added first. The string is released with
// free(), so the caller must not use it afterwards.
Character Interface::symbol_code( char *symbol )
{
  const int known = TheAlphabet.symbol2code(symbol);
  const int code = (known == EOF) ? TheAlphabet.add_symbol( symbol ) : known;
  free(symbol);
  return static_cast<Character>(code);
}
/*******************************************************************/
/* */
/* Interface::character_code */
/* */
/*******************************************************************/
// Returns the symbol code of the character `uc`. If TheAlphabet.utf8 is set,
// the symbol is the string produced by int2utf8(uc); otherwise it is the
// single byte obtained by truncating `uc` to unsigned char.
// The temporary string is handed to symbol_code(), which releases it.
Character Interface::character_code( unsigned int uc )
{
  char *symbol;
  if (TheAlphabet.utf8)
    symbol = fst_strdup(int2utf8(uc));
  else {
    // NOTE(review): the result of malloc is used without a NULL check
    unsigned char *bytes = static_cast<unsigned char*>(malloc(2));
    bytes[0] = static_cast<unsigned char>(uc);
    bytes[1] = 0;
    symbol = reinterpret_cast<char*>(bytes);
  }
  return symbol_code(symbol);
}
/*******************************************************************/
/* */
/* Interface::add_value */
/* */
/*******************************************************************/
// Allocates a new list node holding `c` and puts it in front of the list `r`.
// Returns the new head of the list.
Range *Interface::add_value( Character c, Range *r )
{
  Range *head = new Range;
  head->next = r;
  head->character = c;
  return head;
}
/*******************************************************************/
/* */
/* Interface::add_values */
/* */
/*******************************************************************/
// Puts the codes of all characters from c1 to c2 (both inclusive) in front
// of the list `r`, so that the code for c1 ends up first.
// Returns the new head of the list; `r` itself is returned unchanged if
// c2 < c1.
Range *Interface::add_values( unsigned int c1, unsigned int c2, Range *r )
{
  // empty interval: nothing to add
  if (c2 < c1)
    return r;

  // Count down from c2 to c1. The exit test is inside the loop because
  // "c >= c1" can never fail for an unsigned counter when c1 == 0: the
  // counter would wrap around instead and the loop would not terminate.
  for( unsigned int c=c2; ; c-- ) {
    r = add_value(character_code(c), r);
    if (c == c1)
      break;
  }
  return r;
}
/*******************************************************************/
/* */
/* Interface::append_values */
/* */
/*******************************************************************/
// Returns a list consisting of copies of the nodes of `r2` (in the same
// order) followed by the list `r`. The nodes of `r2` are neither modified
// nor freed.
Range *Interface::append_values( Range *r2, Range *r )
{
  if (r2 != NULL) {
    Range *rest = append_values(r2->next, r);
    return add_value(r2->character, rest);
  }
  return r;
}
/*******************************************************************/
/* */
/* Interface::add_var_values */
/* */
/*******************************************************************/
// Puts the values of the range variable `name` in front of the list `r` and
// returns the new head of the list. `name` is released by svar_value().
Range *Interface::add_var_values( char *name, Range *r )
{
  // svar_value() returns a fresh copy of the variable's values and
  // append_values() copies the nodes once more, so the intermediate copy
  // has to be released here; otherwise it would leak.
  Range *values = svar_value(name);
  Range *result = append_values(values, r);
  while (values != NULL) {
    Range *next = values->next;
    delete values;
    values = next;
  }
  return result;
}
/*******************************************************************/
/* */
/* Interface::in_range */
/* */
/*******************************************************************/
bool Interface::in_range( unsigned int c, Range *r )
{
while (r) {
if (r->character == c)
return true;
r = r->next;
}
return false;
}
/*******************************************************************/
/* */
/* free_values */
/* */
/*******************************************************************/
// Deletes all nodes of the list `r`; NULL is accepted.
// The list is released iteratively, so that the stack depth does not grow
// with the length of the list.
static void free_values( Range *r )
{
  while (r) {
    Range *next = r->next;
    delete r;
    r = next;
  }
}
/*******************************************************************/
/* */
/* free_values */
/* */
/*******************************************************************/
// Deletes all nodes of the list of ranges `r`, last node first; NULL is
// accepted.
// NOTE(review): only the Ranges nodes are deleted here; whether the Range
// list stored in each node is released elsewhere cannot be seen from this
// file - verify against the definition of Ranges.
static void free_values( Ranges *r )
{
  if (!r)
    return;
  free_values(r->next);
  delete r;
}
/*******************************************************************/
/* */
/* free_contexts */
/* */
/*******************************************************************/
// Deletes all nodes of the context list `c`, last node first; NULL is
// accepted.
static void free_contexts( Contexts *c )
{
  if (!c)
    return;
  free_contexts(c->next);
  delete c;
}
/*******************************************************************/
/* */
/* Interface::copy_values */
/* */
/*******************************************************************/
// Returns a newly allocated copy of the list `r` with the characters in the
// same order; the copy of an empty list is NULL.
Range *Interface::copy_values( const Range *r )
{
  if (r != NULL) {
    Range *rest = copy_values(r->next);
    return add_value( r->character, rest );
  }
  return NULL;
}
/*******************************************************************/
/* */
/* Interface::complement_range */
/* */
/*******************************************************************/
// Replaces the list `r` by the list of characters which TheAlphabet.complement
// computes from the characters in `r`. The list `r` is freed. The program is
// aborted with an error message if the result is empty.
Range *Interface::complement_range( Range *r )
{
  // collect the characters, then release the list
  vector<Character> sym;
  Range *p = r;
  while (p) {
    sym.push_back( p->character );
    p = p->next;
  }
  free_values( r );

  TheAlphabet.complement(sym);
  if (sym.empty())
    error("Empty character range!");

  // each character is put in front, so the list has the reverse order of sym
  Range *result = NULL;
  for( size_t i=0; i<sym.size(); i++ )
    result = add_value( sym[i], result );
  return result;
}
/*******************************************************************/
/* */
/* Interface::make_transducer */
/* */
/*******************************************************************/
// Builds a transducer with arcs from the root node to a single final node.
// If one of the ranges is NULL (the wildcard '.'), an arc is added for every
// label of TheAlphabet whose lower/upper character is in the respective
// non-NULL range. Otherwise the two lists are walked in parallel and one arc
// is added per pair of characters; the shorter list stays at its last
// element until the longer one is finished. The ranges are not freed.
Transducer *Interface::make_transducer( Range *r1, Range *r2 )
{
  Transducer *t = new Transducer();
  Node *target = t->new_node();
  target->set_final(1);

  if (r1 != NULL && r2 != NULL) {
    bool more;
    do {
      Label l(r1->character, r2->character);
      // TheAlphabet.insert(l);
      t->root_node()->add_arc( l, target, t );
      more = r1->next || r2->next;
      if (r1->next)
        r1 = r1->next;
      if (r2->next)
        r2 = r2->next;
    } while (more);
    return t;
  }

  // one of the ranges was '.'
  if (!Alphabet_Defined)
    error("The wildcard symbol '.' requires the definition of an alphabet");
  for( Alphabet::const_iterator it=TheAlphabet.begin();
       it!=TheAlphabet.end(); it++ )
  {
    const bool lower_ok = (r1 == NULL || in_range(it->lower_char(), r1));
    if (lower_ok && (r2 == NULL || in_range(it->upper_char(), r2)))
      t->root_node()->add_arc( *it, target, t );
  }
  return t;
}
/*******************************************************************/
/* */
/* Interface::empty_transducer */
/* */
/*******************************************************************/
// Returns a new transducer without arcs whose root node is final.
Transducer *Interface::empty_transducer()
{
  Transducer *result = new Transducer();
  Node *root = result->root_node();
  root->set_final(1);
  return result;
}
/*******************************************************************/
/* */
/* Interface::one_label_transducer */
/* */
/*******************************************************************/
// Returns a new transducer with a single arc labelled `l` which leads from
// the root node to a final node.
Transducer *Interface::one_label_transducer( Label l )
{
  Transducer *result = new Transducer();
  Node *end = result->new_node();
  Node *start = result->root_node();
  start->add_arc( l, end, result );
  end->set_final(1);
  return result;
}
/*******************************************************************/
/* */
/* Interface::new_transducer */
/* */
/*******************************************************************/
// Like make_transducer(), but frees both ranges afterwards. The same list
// may be passed for both arguments; it is then freed only once.
Transducer *Interface::new_transducer( Range *r1, Range *r2 )
{
  Transducer *result = make_transducer( r1, r2 );
  if (r1 == r2)
    free_values(r2);
  else {
    free_values(r1);
    free_values(r2);
  }
  return result;
}
/*******************************************************************/
/* */
/* Interface::read_words */
/* */
/*******************************************************************/
// Reads a transducer from the word list in the file `filename` and inserts
// its symbols into TheAlphabet.
// `filename` is released with free() once the file has been opened.
// Throws a pointer to a static message buffer (char*) if the file cannot be
// opened.
Transducer *Interface::read_words( char *filename )
{
  if (Verbose)
    fprintf(stderr,"\nreading words from %s...", filename);
  ifstream is(filename);
  if (!is.is_open()) {
    static char message[1000];
    // bounded formatting: a very long file name must not overflow the buffer
    snprintf(message, sizeof(message),
             "Error: Cannot open file \"%s\"!", filename);
    throw message;
  }
  free( filename );
  Transducer *t = new Transducer(is, &TheAlphabet, Verbose);
  is.close();
  TheAlphabet.insert_symbols(t->alphabet);
  if (Verbose)
    fprintf(stderr,"finished\n");
  return t;
}
/*******************************************************************/
/* */
/* Interface::read_transducer */
/* */
/*******************************************************************/
// Reads a transducer from the binary file `filename`, copies it with
// TheAlphabet as its alphabet, and inserts the symbols of the copy into
// TheAlphabet. `filename` is released with free() on success.
// Throws a pointer to a static message buffer (char*) if the file cannot be
// opened or if its utf8 flag differs from that of TheAlphabet.
Transducer *Interface::read_transducer( char *filename )
{
  if (Verbose)
    fprintf(stderr,"\nreading transducer from %s...", filename);
  FILE *file = fopen(filename,"rb");
  if (file == NULL) {
    static char message[1000];
    // bounded formatting: a very long file name must not overflow the buffer
    snprintf(message, sizeof(message),
             "Error: Cannot open file \"%s\"!", filename);
    throw message;
  }
  Transducer t(file);
  fclose(file);
  if (t.alphabet.utf8 != TheAlphabet.utf8) {
    static char message[1000];
    snprintf(message, sizeof(message),
             "Error: incompatible character encoding in file \"%s\"!",
             filename);
    throw message;
  }
  free( filename );
  Transducer *nt = &t.copy(false, &TheAlphabet);
  TheAlphabet.insert_symbols(nt->alphabet);
  if (Verbose)
    fprintf(stderr,"finished\n");
  return nt;
}
/*******************************************************************/
/* */
/* Interface::def_alphabet */
/* */
/*******************************************************************/
// Defines TheAlphabet from the transducer `t`: the transducer is exploded
// and minimised, its character pairs are rebuilt with complete_alphabet(),
// and the resulting alphabet replaces the character pairs of TheAlphabet.
// The transducer is deleted afterwards.
void Interface::def_alphabet( Transducer *t )
{
  Transducer *source = minimise( explode(t) );
  source->alphabet.clear_char_pairs();
  source->complete_alphabet();

  TheAlphabet.clear_char_pairs();
  TheAlphabet.copy(source->alphabet);
  Alphabet_Defined = 1;

  delete source;
}
/*******************************************************************/
/* */
/* Interface::def_svar definition of a value range variable */
/* */
/*******************************************************************/
bool Interface::def_svar( char *name, Range *r )
{
// delete the old value of the variable
SVarMap::iterator it=SVM.find(name);
if (it != SVM.end()) {
char *n=it->first;
Range *v=it->second;
SVM.erase(it);
delete v;
free(n);
}
SVM[name] = r;
return r == NULL;
}
/*******************************************************************/
/* */
/* Interface::svar_value */
/* */
/*******************************************************************/
// Returns a newly allocated copy of the value of the range variable `name`.
// The program is aborted with an error message if the variable is undefined.
// `name` is released with free().
Range *Interface::svar_value( char *name )
{
  SVarMap::iterator entry = SVM.find(name);
  if (entry == SVM.end())
    error2("undefined variable", name);
  free(name);
  Range *stored = entry->second;
  return copy_values(stored);
}
/*******************************************************************/
/* */
/* Interface::rsvar_value */
/* */
/*******************************************************************/
// Registers `name` in RSS (as a copy, if it is not registered yet) and
// returns a one-element list holding the symbol code of `name`.
// `name` itself is released by symbol_code().
Range *Interface::rsvar_value( char *name )
{
  const bool registered = (RSS.find(name) != RSS.end());
  if (!registered)
    RSS.insert(fst_strdup(name));
  Character code = symbol_code(name);
  return add_value(code, NULL);
}
/*******************************************************************/
/* */
/* Interface::def_var definition of a transducer variable */
/* */
/*******************************************************************/
bool Interface::def_var( char *name, Transducer *t )
{
// delete the old value of the variable
VarMap::iterator it=VM.find(name);
if (it != VM.end()) {
char *n=it->first;
Transducer *v=it->second;
VM.erase(it);
delete v;
free(n);
}
t = explode(t);
t = minimise(t);
VM[name] = t;
return t->is_empty();
}
/*******************************************************************/
/* */
/* Interface::def_rvar */
/* definition of an agreement variable for automata */
/* */
/*******************************************************************/
// Defines an agreement variable for automata. This works like def_var(),
// except that the program is aborted with an error message if the
// transducer is cyclic.
bool Interface::def_rvar( char *name, Transducer *t )
{
  const bool cyclic = t->is_cyclic();
  if (cyclic)
    error2("cyclic transducer assigned to", name);
  return def_var( name, t );
}
/*******************************************************************/
/* */
/* Interface::var_value */
/* */
/*******************************************************************/
// Returns a copy of the transducer stored for the variable `name`.
// The program is aborted with an error message if the variable is undefined.
// `name` is released with free().
Transducer *Interface::var_value( char *name )
{
  VarMap::iterator entry = VM.find(name);
  if (entry == VM.end())
    error2("undefined variable", name);
  free(name);
  Transducer *stored = entry->second;
  return &(stored->copy());
}
/*******************************************************************/
/* */
/* Interface::rvar_value */
/* */
/*******************************************************************/
// Registers `name` in RS (as a copy, if it is not registered yet) and returns
// a transducer for the symbol of `name` on both sides.
// `name` itself is released by symbol_code().
Transducer *Interface::rvar_value( char *name )
{
  const bool registered = (RS.find(name) != RS.end());
  if (!registered)
    RS.insert(fst_strdup(name));
  Range *symbol = add_value(symbol_code(name), NULL);
  return new_transducer(symbol, symbol);
}
/*******************************************************************/
/* */
/* Interface::explode */
/* */
/*******************************************************************/
// Replaces the agreement variables registered in RS and RSS.
// If neither set has an entry, `t` is returned unchanged. Otherwise `t` is
// minimised and then, for each registered name in turn, replaced by the
// disjunction of
//   - one spliced copy per path of the variable's transducer (names in RS)
//   - one copy per character of the variable's range, with the variable's
//     symbol replaced by that character (names in RSS).
// Both sets are cleared. The argument is consumed; the result is newly
// allocated.
Transducer *Interface::explode( Transducer *t )
{
  if (RS.size() == 0 && RSS.size() == 0)
    return t;
  t = minimise(t);

  vector<char*> name;
  for( RVarSet::iterator it=RS.begin(); it!=RS.end(); it++)
    name.push_back(*it);
  RS.clear();

  // replace all agreement variables for automata
  for( size_t i=0; i<name.size(); i++ ) {
    Transducer *nt = NULL;
    Label l((Character)TheAlphabet.symbol2code(name[i]));
    Transducer *vt=var_value(name[i]); // also releases name[i]

    // enumerate all paths of the transducer
    vector<Transducer*> paths;
    vt->enumerate_paths(paths);
    delete vt;

    // insert each path
    for( size_t k=0; k<paths.size(); k++ ) {
      // insertion
      Transducer *t1 = &t->splice(l, paths[k]);
      delete paths[k];
      if (nt == NULL)
        nt = t1;
      else
        nt = disjunction(nt, t1);
    }
    // A variable without any path leaves nothing to build the disjunction
    // from. Use a transducer without final node instead of NULL, which
    // would be dereferenced later.
    if (nt == NULL)
      nt = new Transducer();
    delete t;
    t = nt;
  }

  name.clear();
  for( RVarSet::iterator it=RSS.begin(); it!=RSS.end(); it++)
    name.push_back(*it);
  RSS.clear();

  // replace all agreement variables for value ranges
  for( size_t i=0; i<name.size(); i++ ) {
    Transducer *nt = NULL;
    Character c=(Character)TheAlphabet.symbol2code(name[i]);
    Range *r=svar_value(name[i]); // also releases name[i]

    // insert each character; the list nodes are released on the way
    while (r != NULL) {
      // insertion
      Transducer *t1 = &t->replace_char(c, r->character);
      if (nt == NULL)
        nt = t1;
      else
        nt = disjunction(nt, t1);
      Range *next = r->next;
      delete r;
      r = next;
    }
    // same guard as above for a variable with an empty range
    if (nt == NULL)
      nt = new Transducer();
    delete t;
    t = nt;
  }
  return t;
}
/*******************************************************************/
/* */
/* Interface::catenate */
/* */
/*******************************************************************/
// Returns the result of `*t1 + *t2` (concatenation) and deletes both
// arguments.
Transducer *Interface::catenate( Transducer *t1, Transducer *t2 )
{
  Transducer &joined = *t1 + *t2;
  delete t1;
  delete t2;
  return &joined;
}
/*******************************************************************/
/* */
/* Interface::add_range */
/* */
/*******************************************************************/
// Allocates a new list node holding the range `r` and puts it in front of
// the list of ranges `l`. Returns the new head of the list.
Ranges *Interface::add_range( Range *r, Ranges *l )
{
  Ranges *head = new Ranges;
  head->next = l;
  head->range = r;
  return head;
}
/*******************************************************************/
/* */
/* Interface::make_mapping */
/* */
/*******************************************************************/
// Builds a transducer which is a chain of nodes with one step per list
// position. As long as both lists have entries, a step has one arc for every
// pair of a character from the current range of list1 (lower side) and a
// character from the current range of list2 (upper side). Once one list is
// exhausted, the remaining ranges of the other list are paired with epsilon.
// The last node of the chain is final. Both lists are passed to
// free_values() at the end.
Transducer *Interface::make_mapping( Ranges *list1, Ranges *list2 )
{
  Transducer *t = new Transducer();
  Node *current = t->root_node();
  Ranges *lower = list1;
  Ranges *upper = list2;

  while (lower || upper) {
    Node *next = t->new_node();
    if (lower && upper) {
      for( Range *r1=lower->range; r1; r1=r1->next )
        for( Range *r2=upper->range; r2; r2=r2->next )
          current->add_arc( Label(r1->character, r2->character), next, t );
    }
    else if (lower) {
      for( Range *r1=lower->range; r1; r1=r1->next )
        current->add_arc( Label(r1->character, Label::epsilon), next, t );
    }
    else {
      for( Range *r2=upper->range; r2; r2=r2->next )
        current->add_arc( Label(Label::epsilon, r2->character), next, t );
    }
    current = next;
    if (lower)
      lower = lower->next;
    if (upper)
      upper = upper->next;
  }

  current->set_final(1);
  free_values(list1);
  free_values(list2);
  return t;
}
/*******************************************************************/
/* */
/* Interface::disjunction */
/* */
/*******************************************************************/
// Returns the result of `*t1 | *t2` (disjunction) and deletes both
// arguments.
Transducer *Interface::disjunction( Transducer *t1, Transducer *t2 )
{
  Transducer &either = *t1 | *t2;
  delete t1;
  delete t2;
  return &either;
}
/*******************************************************************/
/* */
/* Interface::conjunction */
/* */
/*******************************************************************/
// Returns the result of `*t1 & *t2` (conjunction) and deletes both
// arguments. A warning is printed if agreement variables are still
// registered in RS or RSS.
Transducer *Interface::conjunction( Transducer *t1, Transducer *t2 )
{
  const bool pending_agreement = (RS.size() > 0 || RSS.size() > 0);
  if (pending_agreement)
    cerr << "\nWarning: agreement operation inside of conjunction!\n";

  Transducer &both = *t1 & *t2;
  delete t1;
  delete t2;
  return &both;
}
/*******************************************************************/
/* */
/* Interface::subtraction */
/* */
/*******************************************************************/
// Returns the result of `*t1 / *t2` (subtraction) and deletes both
// arguments. A warning is printed if agreement variables are still
// registered in RS or RSS.
Transducer *Interface::subtraction( Transducer *t1, Transducer *t2 )
{
  // the warning names this operation (it used to say "conjunction")
  if (RS.size() > 0 || RSS.size() > 0)
    cerr << "\nWarning: agreement operation inside of subtraction!\n";
  Transducer *t = &(*t1 / *t2);
  delete t1;
  delete t2;
  return t;
}
/*******************************************************************/
/* */
/* Interface::composition */
/* */
/*******************************************************************/
// Returns the result of `*t1 || *t2` (composition) and deletes both
// arguments. A warning is printed if agreement variables are still
// registered in RS or RSS.
Transducer *Interface::composition( Transducer *t1, Transducer *t2 )
{
  const bool pending_agreement = (RS.size() > 0 || RSS.size() > 0);
  if (pending_agreement)
    cerr << "\nWarning: agreement operation inside of composition!\n";

  Transducer &composed = (*t1 || *t2);
  delete t1;
  delete t2;
  return &composed;
}
/*******************************************************************/
/* */
/* Interface::freely_insert */
/* */
/*******************************************************************/
// Returns the result of t->freely_insert() for the label lc:uc.
// NOTE(review): unlike the other unary operations in this file, `t` is not
// deleted here. If Transducer::freely_insert returns a new object, `t` is
// leaked - verify against its definition before changing this.
Transducer *Interface::freely_insert( Transducer *t,
                                      Character lc, Character uc )
{
  Label inserted(lc,uc);
  return &t->freely_insert(inserted);
}
/*******************************************************************/
/* */
/* Interface::negation */
/* */
/*******************************************************************/
// Returns the result of `!*t` (negation) and deletes the argument. Before
// the operation, the character pairs of the argument are cleared and
// TheAlphabet is copied into its alphabet. The program is aborted with an
// error message if no alphabet has been defined; a warning is printed if
// agreement variables are still registered in RS or RSS.
Transducer *Interface::negation( Transducer *t )
{
  const bool pending_agreement = (RS.size() > 0 || RSS.size() > 0);
  if (pending_agreement)
    cerr << "\nWarning: agreement operation inside of negation!\n";
  if (!Alphabet_Defined)
    error("Negation requires the definition of an alphabet");

  t->alphabet.clear_char_pairs();
  t->alphabet.copy(TheAlphabet);

  Transducer &negated = !*t;
  delete t;
  return &negated;
}
/*******************************************************************/
/* */
/* Interface::upper_level */
/* */
/*******************************************************************/
// Returns the result of t->upper_level() and deletes the argument.
Transducer *Interface::upper_level( Transducer *t )
{
  Transducer &upper = t->upper_level();
  delete t;
  return &upper;
}
/*******************************************************************/
/* */
/* Interface::lower_level */
/* */
/*******************************************************************/
// Returns the result of t->lower_level() and deletes the argument.
Transducer *Interface::lower_level( Transducer *t )
{
  Transducer &lower = t->lower_level();
  delete t;
  return &lower;
}
/*******************************************************************/
/* */
/* Interface::minimise */
/* */
/*******************************************************************/
// Copies TheAlphabet into the alphabet of `t`, then returns the result of
// t->minimise() and deletes the argument.
Transducer *Interface::minimise( Transducer *t )
{
  t->alphabet.copy(TheAlphabet);
  Transducer &minimal = t->minimise( Verbose );
  delete t;
  return &minimal;
}
/*******************************************************************/
/* */
/* Interface::switch_levels */
/* */
/*******************************************************************/
// Returns the result of t->switch_levels() and deletes the argument.
Transducer *Interface::switch_levels( Transducer *t )
{
  Transducer &switched = t->switch_levels();
  delete t;
  return &switched;
}
/*******************************************************************/
/* */
/* Interface::repetition */
/* */
/*******************************************************************/
// Returns the result of t->kleene_star() and deletes the argument.
Transducer *Interface::repetition( Transducer *t )
{
  Transducer &starred = t->kleene_star();
  delete t;
  return &starred;
}
/*******************************************************************/
/* */
/* Interface::repetition2 */
/* */
/*******************************************************************/
// Returns `*t` concatenated with t->kleene_star(), i.e. one or more
// repetitions, and deletes the argument as well as the intermediate result.
Transducer *Interface::repetition2( Transducer *t )
{
  Transducer &starred = t->kleene_star();
  Transducer &repeated = *t + starred;
  delete t;
  delete &starred;
  return &repeated;
}
/*******************************************************************/
/* */
/* Interface::optional */
/* */
/*******************************************************************/
// Returns a copy of `t` whose root node is final and deletes the argument.
Transducer *Interface::optional( Transducer *t )
{
  Transducer &result = t->copy();
  result.root_node()->set_final(1);
  delete t;
  return &result;
}
/*******************************************************************/
/* */
/* Interface::add_pi_transitions */
/* */
/*******************************************************************/
// Adds one self-loop arc at `node` for every label contained in `alph`.
// `t` is the transducer handed through to Node::add_arc.
void Interface::add_pi_transitions( Transducer *t, Node *node, Alphabet &alph)
{
  Alphabet::const_iterator label = alph.begin();
  while (label != alph.end()) {
    node->add_arc( *label, node, t );
    label++;
  }
}
/*******************************************************************/
/* */
/* Interface::pi_machine */
/* */
/*******************************************************************/
// Builds a new transducer whose root node is final and carries a
// self-loop for every label of `alph`.  The caller owns the result.
Transducer *Interface::pi_machine( Alphabet &alph )
{
  Transducer *machine = new Transducer();
  machine->root_node()->set_final(1);
  add_pi_transitions( machine, machine->root_node(), alph );
  return machine;
}
/*******************************************************************/
/* */
/* Interface::cp */
/* */
/*******************************************************************/
// Thin wrapper: forwards both ranges to make_transducer() and returns
// whatever that call produces.
Transducer *Interface::cp( Range *lower_range, Range *upper_range )
{
  Transducer *pairs = make_transducer(lower_range, upper_range);
  return pairs;
}
/*******************************************************************/
/* */
/* Interface::anti_cp */
/* */
/*******************************************************************/
// Builds a two-node transducer accepting every single label of
// TheAlphabet whose lower character lies in lower_range but for which
// cp(lower_range, upper_range) has no arc at its root.  The empty
// label Label() is treated the same way when epsilon is in lower_range.
// The ranges are not freed here.
Transducer *Interface::anti_cp( Range *lower_range, Range *upper_range )
{
  Transducer *allowed = cp(lower_range, upper_range);
  Transducer *anti = new Transducer();
  Node *accept = anti->new_node();
  accept->set_final(1);

  Alphabet::const_iterator it = TheAlphabet.begin();
  while (it != TheAlphabet.end()) {
    Label l = *it;
    if (in_range(l.lower_char(), lower_range) &&
        !allowed->root_node()->target_node(l))
      anti->root_node()->add_arc( l, accept, anti );
    it++;
  }

  // the epsilon:epsilon label is handled separately from the alphabet loop
  if (in_range(Label::epsilon, lower_range) &&
      !allowed->root_node()->target_node(Label()))
    anti->root_node()->add_arc( Label(), accept, anti );

  delete allowed;
  return anti;
}
/*******************************************************************/
/* */
/* Interface::twol_right_rule */
/* */
/*******************************************************************/
// Builds the complement of the union of two "unwanted" languages:
//   (!lc) center pi   and   pi center (!rc)
// where center = cp(lower_range, upper_range) and pi = pi_machine(TheAlphabet).
// lc and rc are not deleted, but their alphabets are overwritten with
// TheAlphabet before they are negated.
Transducer *Interface::twol_right_rule( Transducer *lc, Range *lower_range,
                                        Range *upper_range, Transducer *rc )
{
  Transducer *center = cp(lower_range, upper_range);
  Transducer *any = pi_machine(TheAlphabet);

  // first unwanted language: (!lc) center pi
  lc->alphabet.copy(TheAlphabet);
  Transducer &not_left = !*lc;
  Transducer &not_left_center = not_left + *center;
  delete &not_left;
  Transducer &unwanted1 = not_left_center + *any;
  delete &not_left_center;

  // second unwanted language: pi center (!rc)
  rc->alphabet.copy(TheAlphabet);
  Transducer &not_right = !*rc;
  Transducer &center_not_right = *center + not_right;
  delete center;
  delete &not_right;
  Transducer &unwanted2 = *any + center_not_right;
  delete any;
  delete &center_not_right;

  // complement of the union
  Transducer &unwanted = unwanted1 | unwanted2;
  delete &unwanted1;
  delete &unwanted2;
  unwanted.alphabet.copy(TheAlphabet);
  Transducer &rule = !unwanted;
  delete &unwanted;
  return &rule;
}
/*******************************************************************/
/* */
/* Interface::twol_left_rule */
/* */
/*******************************************************************/
// Builds the complement of  lc anti_cp(lower_range, upper_range) rc.
// When epsilon is in lower_range (an insertion such as "$L <> <= a $R"),
// error() is called if either context generates the empty string, and a
// warning is printed.  lc and rc are not deleted.
Transducer *Interface::twol_left_rule( Transducer *lc, Range *lower_range,
                                       Range *upper_range, Transducer *rc )
{
  const bool is_insertion = in_range(Label::epsilon, lower_range);
  if (is_insertion) {
    if (lc->generates_empty_string())
      error("in two level rule: insertion operation with deletable left context!");
    if (rc->generates_empty_string())
      error("in two level rule: insertion operation with deletable right context!");
    cerr << "\nWarning: two level rule used for insertion operation (might produce unexpected results)\n";
  }

  // center of the rule
  Transducer *forbidden = anti_cp(lower_range, upper_range);

  // left context + center
  Transducer &left_center = *lc + *forbidden;
  delete forbidden;

  // ... + right context
  Transducer &full = left_center + *rc;
  delete &left_center;

  // complement with respect to TheAlphabet
  full.alphabet.copy(TheAlphabet);
  Transducer &rule = !full;
  delete &full;
  return &rule;
}
/*******************************************************************/
/* */
/* Interface::make_rule */
/* */
/*******************************************************************/
// Builds a two-level rule transducer of the requested type.
// A NULL context is replaced by a pi machine; a given left context is
// prefixed with pi and a given right context is suffixed with pi.
// Both contexts and both ranges are released here (upper_range once,
// lower_range only if it is a different pointer).  The result is
// passed through minimise().
Transducer *Interface::make_rule( Transducer *lc, Range *lower_range,
                                  Twol_Type type, Range *upper_range,
                                  Transducer *rc )
{
  if (RS.size() > 0 || RSS.size() > 0)
    cerr << "\nWarning: agreement operation inside of replacement rule!\n";
  if (!Alphabet_Defined)
    error("Two level rules require the definition of an alphabet");

  // expand the left and the right contexts to their full length
  Transducer *pi = pi_machine(TheAlphabet);
  if (lc != NULL) {
    Transducer *extended = &(*pi + *lc);
    delete lc;
    lc = extended;
  }
  else
    lc = pi_machine(TheAlphabet);

  if (rc != NULL) {
    Transducer *extended = &(*rc + *pi);
    delete rc;
    rc = extended;
  }
  else
    rc = pi_machine(TheAlphabet);
  delete pi;

  Transducer *result = NULL;
  if (type == twol_left)
    result = twol_left_rule(lc, lower_range, upper_range, rc);
  else if (type == twol_right)
    result = twol_right_rule(lc, lower_range, upper_range, rc);
  else if (type == twol_both) {
    Transducer *left_rule = twol_left_rule(lc, lower_range, upper_range, rc);
    Transducer *right_rule = twol_right_rule(lc, lower_range, upper_range, rc);
    result = &(*left_rule & *right_rule);
    delete left_rule;
    delete right_rule;
  }

  delete lc;
  delete rc;
  // the two ranges may be the same list; free it only once in that case
  if (lower_range != upper_range)
    free_values(lower_range);
  free_values(upper_range);

  return minimise(result);
}
/*******************************************************************/
/* */
/* Interface::make_context */
/* */
/*******************************************************************/
// Allocates a single-element context list holding the pair (l, r).
// A NULL side is replaced by the result of empty_transducer().
Contexts *Interface::make_context( Transducer *l, Transducer *r )
{
  Contexts *ctx = new Contexts();
  ctx->left = (l == NULL) ? empty_transducer() : l;
  ctx->right = (r == NULL) ? empty_transducer() : r;
  ctx->next = NULL;
  return ctx;
}
/*******************************************************************/
/* */
/* Interface::add_context */
/* */
/*******************************************************************/
// Links nc in front of the list c and returns nc as the new list head.
Contexts *Interface::add_context( Contexts *nc, Contexts *c )
{
  Contexts *head = nc;
  head->next = c;
  return head;
}
/*******************************************************************/
/* */
/* Interface::restriction_transducer */
/* */
/*******************************************************************/
// Computes  !( replace_char(marker -> epsilon)( l1 / l2 ) ).
// The alphabet of l1 is overwritten with TheAlphabet before the
// "/" operation; neither l1 nor l2 is deleted.
Transducer *Interface::restriction_transducer( Transducer *l1, Transducer *l2,
                                               Character marker )
{
  l1->alphabet.copy(TheAlphabet);
  Transducer &difference = *l1 / *l2;

  // drop the marker symbol from the difference
  Transducer &unmarked = difference.replace_char(marker, Label::epsilon);
  delete &difference;

  unmarked.alphabet.copy(TheAlphabet);
  Transducer &complement = !unmarked;
  delete &unmarked;
  return &complement;
}
/*******************************************************************/
/* */
/* Interface::marker_transducer */
/* */
/*******************************************************************/
// Requests a fresh marker from TheAlphabet (returned via `marker`) and
// builds a one-label transducer for it.  The symbols of t and of every
// left/right context in the list c are inserted into its alphabet.
Transducer *Interface::marker_transducer( Transducer *t, Contexts *c,
                                          Character &marker )
{
  marker = TheAlphabet.new_marker();
  Transducer *marker_fst = one_label_transducer( Label(marker) );

  // build the alphabet with a new marker
  marker_fst->alphabet.insert_symbols(t->alphabet);
  for( Contexts *ctx = c; ctx; ctx = ctx->next ) {
    marker_fst->alphabet.insert_symbols(ctx->left->alphabet);
    marker_fst->alphabet.insert_symbols(ctx->right->alphabet);
  }
  return marker_fst;
}
/*******************************************************************/
/* */
/* Interface::center_transducer */
/* */
/*******************************************************************/
// Returns the concatenation  pi + mt + t + mt + pi.
// None of the arguments is deleted; intermediate results are.
Transducer *Interface::center_transducer( Transducer *t, Transducer *pi,
                                          Transducer *mt )
{
  Transducer &pi_m = *pi + *mt;
  Transducer &pi_m_t = pi_m + *t;
  delete &pi_m;
  Transducer &pi_m_t_m = pi_m_t + *mt;
  delete &pi_m_t;
  Transducer &whole = pi_m_t_m + *pi;
  delete &pi_m_t_m;
  return &whole;
}
/*******************************************************************/
/* */
/* Interface::context_transducer */
/* */
/*******************************************************************/
// Builds the union, over all contexts in the list c, of
//   pi + left[i] + (mt + t + mt) + right[i] + pi
// Returns NULL when c is empty.  None of the arguments is deleted.
Transducer *Interface::context_transducer( Transducer *t, Transducer *pi,
                                           Transducer *mt, Contexts *c )
{
  // marked center: mt + t + mt (shared by all contexts)
  Transducer *t1 = &(*mt + *t);
  Transducer *tmp = &(*t1 + *mt);
  delete t1;

  Transducer *result=NULL;
  while (c) {
    t1 = &(*pi + *c->left);
    Transducer *t2 = &(*t1 + *tmp);
    delete t1;
    t1 = &(*t2 + *c->right);
    delete t2;
    t2 = &(*t1 + *pi);
    delete t1;

    if (result) {
      t1 = &(*result | *t2);
      // the union is a new transducer: release both operands, including
      // the previous accumulated result (it used to be leaked here)
      delete result;
      delete t2;
      result = t1;
    }
    else
      result = t2;
    c = c->next;
  }
  delete tmp;
  return result;
}
/*******************************************************************/
/* */
/* Interface::result_transducer */
/* */
/*******************************************************************/
// Combines l1 and l2 via restriction_transducer() according to `type`:
//   twol_right -> restriction(l1, l2)
//   twol_left  -> restriction(l2, l1)
//   twol_both  -> intersection of the two
// Returns NULL for any other value of `type`.
Transducer *Interface::result_transducer( Transducer *l1, Transducer *l2,
                                          Twol_Type type, Character marker )
{
  switch (type) {
  case twol_right:
    return restriction_transducer( l1, l2, marker );
  case twol_left:
    return restriction_transducer( l2, l1, marker );
  case twol_both:
    {
      Transducer *forward = restriction_transducer( l1, l2, marker );
      Transducer *backward = restriction_transducer( l2, l1, marker );
      Transducer *both = &(*forward & *backward);
      delete forward;
      delete backward;
      return both;
    }
  default:
    return NULL;
  }
}
/*******************************************************************/
/* */
/* Interface::restriction */
/* */
/*******************************************************************/
// Builds the restriction of t to the contexts in c.
// direction == 0 uses pi as context center, direction == 1 uses
// (lower level of t) || pi, any other value uses pi || (upper level of t).
// t is deleted and the context list is handed to free_contexts().
Transducer *Interface::restriction( Transducer *t, Twol_Type type,
                                    Contexts *c, int direction )
{
  Character marker;
  Transducer *mt = marker_transducer( t, c, marker );
  Transducer *pi = pi_machine(TheAlphabet);
  Transducer *l1 = center_transducer( t, pi, mt );

  Transducer *center = pi;
  if (direction != 0) {
    if (direction == 1) {
      // compute _t || .*
      Transducer &lower = t->lower_level();
      center = &(lower || *pi);
      delete &lower;
    }
    else {
      // compute ^t || .*
      Transducer &upper = t->upper_level();
      center = &(*pi || upper);
      delete &upper;
    }
  }
  delete t;

  Transducer *l2 = context_transducer( center, pi, mt, c );
  // center aliases pi when direction == 0; avoid deleting it twice
  if (center != pi)
    delete center;
  delete pi;
  delete mt;

  Transducer *result = result_transducer( l1, l2, type, marker );
  delete l1;
  delete l2;
  free_contexts( c );
  return result;
}
/*******************************************************************/
/* */
/* Interface::constrain_boundary_transducer */
/* */
/*******************************************************************/
// Returns !( pi leftm rightm pi ): the complement of all strings that
// contain leftm immediately followed by rightm, where pi is the pi
// machine over TheAlphabet.
Transducer *Interface::constrain_boundary_transducer( Character leftm,
                                                      Character rightm )
{
  // start from pi and extend it in place to  pi <L><R> pi
  Transducer *machine = pi_machine(TheAlphabet);
  Node *start = machine->root_node();
  Node *middle = machine->new_node();
  Node *accept = machine->new_node();

  start->set_final(0);
  accept->set_final(1);
  start->add_arc( Label(leftm), middle, machine);
  middle->add_arc( Label(rightm), accept, machine);
  add_pi_transitions( machine, accept, TheAlphabet );

  // complement with respect to TheAlphabet
  machine->alphabet.copy(TheAlphabet);
  Transducer &complement = !*machine;
  delete machine;
  return &complement;
}
/*******************************************************************/
/* */
/* Interface::extended_left_transducer */
/* */
/*******************************************************************/
// Builds  .* (m2 >> (m1 >> t))  ||  !(.* m1).
// A NULL t (empty context) yields a plain pi machine.  t is deleted.
Transducer *Interface::extended_left_transducer( Transducer *t,
                                                 Character m1, Character m2 )
{
  if (t == NULL) // empty context
    return pi_machine(TheAlphabet);

  // freely insert both markers into the context
  Transducer &with_m1 = t->freely_insert( Label(m1) );
  delete t;
  Transducer *context = &with_m1.freely_insert( Label(m2) );
  delete &with_m1;

  // prefix with .* by adding self-loops at the root
  add_pi_transitions( context, context->root_node(), TheAlphabet );

  // !(.* m1)
  Transducer *ends_in_m1 = one_label_transducer(Label(m1));
  add_pi_transitions( ends_in_m1, ends_in_m1->root_node(), TheAlphabet );
  ends_in_m1->alphabet.copy(TheAlphabet);
  Transducer &not_ends_in_m1 = !*ends_in_m1;
  delete ends_in_m1;

  // compose the two parts
  Transducer &composed = *context || not_ends_in_m1;
  delete context;
  delete &not_ends_in_m1;
  return &composed;
}
/*******************************************************************/
/* */
/* Interface::left_context */
/* */
/*******************************************************************/
// Builds the left-context constraint transducer
//   !( (!ct mt) | (ct !mt) )
// with  ct = extended_left_transducer(t, m1, m2)  and  mt = m2* m1 .*
// t is consumed by extended_left_transducer().
//
// A former debugging block that made (and never freed) copies of
// ct, no_ct, mt and no_mt on the first call has been removed; the
// copies were not used for anything.
Transducer *Interface::left_context( Transducer *t,
                                     Character m1, Character m2 )
{
  // .* (<R> >> (<L> >> $T$)) || !(.*<L>)
  Transducer *ct = extended_left_transducer(t, m1, m2);

  // <R>* <L> .*
  Transducer *mt = one_label_transducer(Label(m1));
  mt->root_node()->add_arc(Label(m2), mt->root_node(), mt );
  add_pi_transitions(mt, mt->root_node()->target_node(Label(m1)),TheAlphabet);

  // complements are taken with respect to TheAlphabet
  ct->alphabet.copy(TheAlphabet);
  Transducer *no_ct = &!*ct;
  mt->alphabet.copy(TheAlphabet);
  Transducer *no_mt = &!*mt;

  // !ct mt
  Transducer *t1 = &(*no_ct + *mt);
  delete no_ct;
  delete mt;

  // ct !mt
  Transducer *t2 = &(*ct + *no_mt);
  delete ct;
  delete no_mt;

  // !( (!ct mt) | (ct !mt) )
  Transducer *tmp = &(*t1 | *t2);
  delete t1;
  delete t2;
  tmp->alphabet.copy(TheAlphabet);
  t1 = &!*tmp;
  delete tmp;
  return t1;
}
/*******************************************************************/
/* */
/* Interface::make_optional */
/* */
/*******************************************************************/
// Returns the union of t with the pi machine over TheAlphabet.
// The argument t is deleted.
Transducer *Interface::make_optional( Transducer *t )
{
  Transducer *identity = pi_machine(TheAlphabet);
  Transducer &either = *t | *identity;
  delete t;
  delete identity;
  return &either;
}
/*******************************************************************/
/* */
/* Interface::replace */
/* */
/*******************************************************************/
// Unconditional replacement:  (no_ct ct)* no_ct
// where no_ct = !(.* X .*) and X is the lower level of ct for repl_up
// or the upper level for repl_down.  Any other type calls error().
// ct is deleted.  With `optional` set the result is additionally passed
// through make_optional().
Transducer *Interface::replace( Transducer *ct, Repl_Type type,
                                bool optional )
{
  // compute the no-center transducer
  Transducer *side = NULL;
  switch (type) {
  case repl_up:
    // _ct
    side = &ct->lower_level();
    break;
  case repl_down:
    // ^ct
    side = &ct->upper_level();
    break;
  default:
    error("Invalid type of replace operator");
  }

  // .* _ct
  add_pi_transitions( side, side->root_node(), TheAlphabet );

  // .* _ct .*
  Transducer *any = pi_machine(TheAlphabet);
  Transducer *contains_center = &(*side + *any);
  delete side;
  delete any;

  // no_ct = !(.* _ct .*)
  contains_center->alphabet.copy(TheAlphabet);
  Transducer *no_ct = &(!*contains_center);
  delete contains_center;

  // compute the unconditional replacement transducer
  // no-ct ct
  Transducer *step = &(*no_ct + *ct);
  delete ct;

  // (no-ct ct)*
  Transducer *steps = &(step->kleene_star());
  delete step;

  // (no-ct ct)* no-ct
  Transducer *replacement = &(*steps + *no_ct);
  delete steps;
  delete no_ct;

  return optional ? make_optional(replacement) : replacement;
}
/*******************************************************************/
/* */
/* Interface::replace_transducer */
/* */
/*******************************************************************/
// Wraps the center transducer in boundary markers and hands it to
// replace():   lm ( rm >> ( lm >> ct ) ) rm
// ct is deleted; the result is never made optional.
Transducer *Interface::replace_transducer( Transducer *ct, Character lm,
                                           Character rm, Repl_Type type )
{
  // insert boundary markers into the center transducer
  // <L> >> (<R> >> $Center$)
  Transducer &with_lm = ct->freely_insert(Label(lm));
  delete ct;
  Transducer &with_both = with_lm.freely_insert(Label(rm));
  delete &with_lm;

  // add surrounding boundary markers to the center transducer
  // <L> (<L> >> (<R> >> $Center$))
  Transducer *left_marker = one_label_transducer( Label(lm) );
  Transducer &opened = *left_marker + with_both;
  delete left_marker;
  delete &with_both;

  // $CenterB$ = <L> (<L> >> (<R> >> $Center$)) <R>
  Transducer *right_marker = one_label_transducer( Label(rm) );
  Transducer &bracketed = opened + *right_marker;
  delete &opened;
  delete right_marker;

  return replace(&bracketed, type, false);
}
/*******************************************************************/
/* */
/* Interface::replace_in_context */
/* */
/*******************************************************************/
// Conditional replacement of t in the context given by the first
// element of c.  The implementation of the replace operators is based on
// "The Replace Operator" by Lauri Karttunen.
//
// The result is the composition
//   insert-boundaries || constrain-boundaries || [contexts] ||
//   replace || [contexts] || remove-boundaries
// where the left/right context constraints are applied before or after
// the replacement depending on `type`.
//
// t, c->left and c->right are consumed; c is handed to free_contexts().
// Two temporary markers are added to TheAlphabet and removed again via
// TheAlphabet.delete_markers() before returning.
//
// A former debugging sequence that copied ibt, cbt, lct, rct, rt and rbt
// (and leaked every copy on each call) has been removed.
Transducer *Interface::replace_in_context( Transducer *t, Repl_Type type,
                                           Contexts *c, bool optional )
{
  if (!Alphabet_Defined)
    error("The replace operators require the definition of an alphabet");
  if (!c->left->is_automaton() || !c->right->is_automaton())
    error("The replace operators require automata as context expressions!");

  // create the marker symbols
  Character leftm = TheAlphabet.new_marker();
  Character rightm = TheAlphabet.new_marker();

  /////////////////////////////////////////////////////////////
  // Create the insert boundaries transducer (.|<>:<L>|<>:<R>)*
  /////////////////////////////////////////////////////////////
  Transducer *ibt=pi_machine(TheAlphabet);
  Node *root=ibt->root_node();
  root->add_arc( Label(Label::epsilon, leftm), root, ibt);
  root->add_arc( Label(Label::epsilon, rightm),root, ibt);

  /////////////////////////////////////////////////////////////
  // Create the remove boundaries transducer (.|<L>:<>|<R>:<>)*
  /////////////////////////////////////////////////////////////
  Transducer *rbt=pi_machine(TheAlphabet);
  root = rbt->root_node();
  root->add_arc( Label(leftm, Label::epsilon), root, rbt);
  root->add_arc( Label(rightm,Label::epsilon), root, rbt);

  // Add the markers to the alphabet
  // (done only after ibt and rbt were built from the marker-free alphabet)
  TheAlphabet.insert(Label(leftm));
  TheAlphabet.insert(Label(rightm));

  /////////////////////////////////////////////////////////////
  // Create the constrain boundaries transducer !(.*<L><R>.*)
  /////////////////////////////////////////////////////////////
  Transducer *cbt=constrain_boundary_transducer(leftm, rightm);

  /////////////////////////////////////////////////////////////
  // Create the extended context transducers
  /////////////////////////////////////////////////////////////
  // left context transducer: .* (<R> >> (<L> >> $T$)) || !(.*<L>)
  Transducer *lct = left_context(c->left, leftm, rightm);

  // right context transducer: (<R> >> (<L> >> $T$)) .* || !(<R>.*)
  // built as the reversal of a left context over the reversed expression
  Transducer *tmp = &c->right->reverse();
  delete c->right;
  Transducer *t2 = left_context(tmp, rightm, leftm);
  Transducer *rct = &t2->reverse();
  delete t2;

  /////////////////////////////////////////////////////////////
  // unconditional replace transducer
  /////////////////////////////////////////////////////////////
  Transducer *rt;
  if (type == repl_up || type == repl_right || type == repl_left)
    rt = replace_transducer( t, leftm, rightm, repl_up );
  else
    rt = replace_transducer( t, leftm, rightm, repl_down );

  /////////////////////////////////////////////////////////////
  // build the conditional replacement transducer
  /////////////////////////////////////////////////////////////
  tmp = &(*ibt || *cbt);
  delete(ibt);
  delete(cbt);

  // context constraints checked before the replacement
  if (type == repl_up || type == repl_left) {
    t2 = &(*tmp || *lct);
    delete tmp;
    delete lct;
    tmp = t2;
  }
  if (type == repl_up || type == repl_right) {
    t2 = &(*tmp || *rct);
    delete tmp;
    delete rct;
    tmp = t2;
  }

  t2 = &(*tmp || *rt);
  delete tmp;
  delete rt;
  tmp = t2;

  // context constraints checked after the replacement
  if (type == repl_down || type == repl_right) {
    t2 = &(*tmp || *lct);
    delete tmp;
    delete lct;
    tmp = t2;
  }
  if (type == repl_down || type == repl_left) {
    t2 = &(*tmp || *rct);
    delete tmp;
    delete rct;
    tmp = t2;
  }

  t2 = &(*tmp || *rbt);
  delete tmp;
  delete rbt;

  // Remove the markers from the alphabet
  TheAlphabet.delete_markers();

  if (optional)
    t2 = make_optional(t2);
  free_contexts( c );
  return t2;
}
/*******************************************************************/
/* */
/* Interface::add_alphabet */
/* */
/*******************************************************************/
// Overwrites t's alphabet with a copy of TheAlphabet and then calls
// t->complete_alphabet().
void Interface::add_alphabet( Transducer *t )
{
  Transducer &target = *t;
  target.alphabet.copy(TheAlphabet);
  target.complete_alphabet();
}
/*******************************************************************/
/* */
/* Interface::write_to_file */
/* */
/*******************************************************************/
// Explodes, completes and minimises t, then stores it in `filename`
// (opened in binary write mode).  Exits the process with status 1 if the
// file cannot be opened.
//
// Ownership: `filename` is released with free(), and t is consumed.
// The transducer returned by minimise() is a new object that nobody else
// references, so it is deleted here after it has been written (it used
// to be leaked).
void Interface::write_to_file( Transducer *t, char *filename)
{
  FILE *file;
  if ((file = fopen(filename,"wb")) == NULL) {
    fprintf(stderr,"\nError: Cannot open output file \"%s\"\n\n", filename);
    exit(1);
  }
  free( filename );

  t = explode(t);
  add_alphabet(t);
  t = minimise(t);
  t->store(file);
  fclose(file);
  delete t;
}
/*******************************************************************/
/* */
/* Interface::result */
/* */
/*******************************************************************/
// Finalises a compiled transducer: explodes it, discards all entries of
// the variable table VM (values are deleted, keys are free()d), optionally
// switches the levels, adds the alphabet and minimises.
Transducer *Interface::result( Transducer *t, bool switch_flag )
{
  t = explode(t);

  // delete the variable values; remember the keys so that they can be
  // released after the table itself has been cleared
  vector<char*> names;
  VarMap::iterator entry = VM.begin();
  while (entry != VM.end()) {
    names.push_back(entry->first);
    delete entry->second;
    entry->second = NULL;
    entry++;
  }
  VM.clear();
  for( size_t k=0; k<names.size(); k++ )
    free(names[k]);
  names.clear();

  if (switch_flag)
    t = switch_levels(t);
  add_alphabet(t);
  return minimise(t);
}
}
/*******************************************************************/
/* */
/* FILE interface.h */
/* MODULE interface */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/*******************************************************************/
#ifndef _INTERFACE_H_
#define _INTERFACE_H_
#include "utf8.h"
#include "fst.h"
using std::set;
using std::cerr;
namespace SFST {
// Error reporting functions; only declared in this header.
// NOTE(review): callers do not appear to handle a return from these
// functions — presumably they terminate the program; confirm.
void error( const char *message );
void error2( const char *message, char *input );
// Variant selector for two-level rules and restrictions.
typedef enum {
  twol_left,
  twol_right,
  twol_both
} Twol_Type;

// Variant selector for the replace operators.
typedef enum {
  repl_left,
  repl_right,
  repl_up,
  repl_down
} Repl_Type;
// Singly linked list node holding one character of a range.
struct range_t {
  Character character;
  struct range_t *next;
};
typedef struct range_t Range;
// Singly linked list node holding one Range list.
struct ranges_t {
  Range *range;
  struct ranges_t *next;
};
typedef struct ranges_t Ranges;
// Singly linked list node holding a left/right context pair.
struct contexts_t {
  Transducer *left;
  Transducer *right;
  struct contexts_t *next;
};
typedef struct contexts_t Contexts;
// Compiler-side interface for building transducers: holds the global
// alphabet, the variable tables, and the operations that construct and
// combine Transducer objects.
class Interface {
private:
// Strict-weak ordering on C strings (by strcmp), used for RVarSet.
struct ltstr {
bool operator()(const char* s1, const char* s2) const
{ return strcmp(s1, s2) < 0; }
};
// Equality on C strings (by strcmp), used for the hash maps below.
struct eqstr {
bool operator()(const char* s1, const char* s2) const
{ return strcmp(s1, s2) == 0; }
};
// Set of names, ordered by string content.
typedef set<char*, ltstr> RVarSet;
// Name -> transducer table.
typedef hash_map<char*, Transducer*, hash<const char*>, eqstr> VarMap;
// Name -> range table.
typedef hash_map<char*, Range*, hash<const char*>, eqstr> SVarMap;
// Internal helpers used to assemble rules, restrictions and replacements.
Range *copy_values( const Range *r );
Transducer *empty_transducer();
Transducer *one_label_transducer( Label l );
void add_pi_transitions( Transducer *t, Node *node, Alphabet &alph );
Transducer *pi_machine( Alphabet &alph );
Transducer *cp( Range *lower_range, Range *upper_range );
Transducer *anti_cp( Range *lower_range, Range *upper_range );
Transducer *twol_right_rule( Transducer *lc, Range *lower_range,
Range *upper_range, Transducer *rc);
Transducer *twol_left_rule( Transducer *lc, Range *lower_range,
Range *upper_range, Transducer *rc );
Transducer *restriction_transducer( Transducer *l1, Transducer *l2,
Character marker );
Transducer *marker_transducer( Transducer *t, Contexts *c,
Character &marker );
Transducer *center_transducer( Transducer *t, Transducer *pi,
Transducer *mt );
Transducer *context_transducer( Transducer *t, Transducer *pi,
Transducer *mt, Contexts *c );
Transducer *constrain_boundary_transducer( Character leftm, Character rm );
Transducer *extended_left_transducer( Transducer *t,
Character m1, Character m2 );
Transducer *left_context( Transducer *t, Character m1, Character m2 );
Transducer *make_optional( Transducer *t );
Transducer *replace_transducer( Transducer *ct, Character lm,
Character rm, Repl_Type type );
Transducer *result_transducer( Transducer *l1, Transducer *l2,
Twol_Type type, Character marker );
// Variable tables.
// NOTE(review): RS/RSS presumably track agreement variables that are
// currently in use — confirm against the definitions of rvar_value()
// and rsvar_value().
VarMap VM;
SVarMap SVM;
RVarSet RS;
RVarSet RSS;
public:
// Passed to Transducer::minimise() by Interface::minimise().
bool Verbose;
// Checked before two-level rules and replace operators are built.
bool Alphabet_Defined;
// The alphabet copied into transducers before complement/minimise.
Alphabet TheAlphabet;
// Stores the utf8 flag in TheAlphabet; no alphabet is defined initially.
Interface( bool utf8=false, bool verbose=false ) :
Verbose(verbose), Alphabet_Defined(false)
{
TheAlphabet.utf8 = utf8;
}
Transducer *new_transducer( Range*, Range* );
Transducer *read_words( char *filename );
Transducer *read_transducer( char *filename );
Transducer *var_value( char *name );
Transducer *rvar_value( char *name );
Range *svar_value( char *name );
Range *complement_range( Range* );
Range *rsvar_value( char *name );
Character character_code( unsigned int uc );
Character symbol_code( char *s );
bool in_range( unsigned int c, Range *r );
Transducer *make_transducer( Range *r1, Range *r2 );
Range *add_value( Character, Range*);
Range *add_var_values( char *name, Range*);
Range *add_values( unsigned int, unsigned int, Range*);
Range *append_values( Range *r2, Range *r );
void add_alphabet( Transducer* );
// These functions delete their argument automata
void def_alphabet( Transducer *a );
bool def_var( char *name, Transducer *a );
bool def_rvar( char *name, Transducer *a );
bool def_svar( char *name, Range *r );
Transducer *explode( Transducer *a );
Transducer *catenate( Transducer *a1, Transducer *a2 );
Transducer *disjunction( Transducer *a1, Transducer *a2 );
Transducer *conjunction( Transducer *a1, Transducer *a2 );
Transducer *subtraction( Transducer *a1, Transducer *a2 );
Transducer *composition( Transducer *a1, Transducer *a2 );
Transducer *restriction( Transducer *a, Twol_Type type, Contexts *c, int );
Transducer *replace( Transducer *a, Repl_Type type, bool optional );
Transducer *replace_in_context( Transducer *a, Repl_Type type,
Contexts *c, bool optional );
Transducer *negation( Transducer *a );
Transducer *upper_level( Transducer *a );
Transducer *lower_level( Transducer *a );
Transducer *minimise( Transducer *a );
Transducer *switch_levels( Transducer *a );
Transducer *repetition( Transducer *a );
Transducer *repetition2( Transducer *a );
Transducer *optional( Transducer *a );
Transducer *make_rule( Transducer *lc, Range *r1, Twol_Type type,
Range *r2, Transducer *rc );
Transducer *freely_insert( Transducer *a, Character lc, Character uc );
Transducer *make_mapping( Ranges*, Ranges* );
Ranges *add_range( Range*, Ranges* );
Contexts *make_context( Transducer *l, Transducer *r );
Contexts *add_context( Contexts *nc, Contexts *c );
Transducer *result( Transducer*, bool );
void write_to_file( Transducer*, char *filename);
};
}
#endif
/*******************************************************************/
/* */
/* FILE lowmem.C */
/* MODULE lowmem */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE Code needed for analysing data */
/* */
/*******************************************************************/
#include <stdio.h>
#include "lowmem.h"
using std::vector;
namespace SFST {
// Buffer size constant for this module.
// NOTE(review): not referenced by any function visible in this file —
// confirm whether it is still needed.
const int BUFFER_SIZE=1000;
/*******************************************************************/
/* */
/* LowMemTransducer::analyze */
/* */
/*******************************************************************/
void LowMemTransducer::analyze( const LMNode &node,
vector<Character> &input, size_t ipos,
Analysis &ca, vector<Analysis> &analyses )
{
if (node.finalp && ipos == input.size())
// store the new analysis
analyses.push_back(ca);
// follow the transitions
for( int i=0; i<node.number_of_arcs; i++ ) {
ca.push_back(node.arc[i].label);
LMNode target(node.arc[i].tnodepos, lmafile);
if (node.arc[i].label.upper_char() == Label::epsilon)
analyze(target, input, ipos, ca, analyses);
else if (ipos < input.size() &&
node.arc[i].label.upper_char() == (Character)input[ipos])
analyze(target, input, ipos+1, ca, analyses);
ca.pop_back();
}
}
/*******************************************************************/
/* */
/* LowMemTransducer::analyze_string */
/* */
/*******************************************************************/
void LowMemTransducer::analyze_string(char *string, vector<Analysis> &analyses)
{
vector<Character> input;
alphabet.string2symseq( string, input );
Analysis ca;
analyses.clear();
analyze(*rootnode, input, 0, ca, analyses);
if (simplest_only)
alphabet.disambiguate( analyses );
}
/*******************************************************************/
/* */
/* LowMemTransducer::LowMemTransducer */
/* */
/*******************************************************************/
// Opens a lowmem transducer stored in `file`.  The file must start with
// the byte 'l'; otherwise a C-string exception is thrown.  The alphabet
// is read next, and the root node is loaded from the file position that
// follows it.  The FILE pointer is kept for later node loads.
LowMemTransducer::LowMemTransducer( FILE *file )
{
  simplest_only = false;
  lmafile = file;

  const int magic = fgetc(file);
  if (magic != 'l')
    throw "Error: wrong file format (not a lowmem transducer)\n";

  alphabet.read(file);
  const long root_pos = ftell(file);
  rootnode = new LMNode(root_pos, lmafile);
}
}
/*******************************************************************/
/* */
/* FILE lowmem.h */
/* MODULE lowmem */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE finite state tools */
/* */
/*******************************************************************/
#ifndef _LOWMEM_H_
#define _LOWMEM_H_
#include <stdio.h>
#include "alphabet.h"
#include <vector>
namespace SFST {
/***************** class LMArc ***********************************/
// One outgoing arc of a node in a lowmem transducer: its label and the
// file position of the target node.  The default constructor leaves
// tnodepos unset.
class LMArc {
public:
  Label label;
  unsigned int tnodepos;

  LMArc() {}
};
/***************** class LMNode **********************************/
class LMNode {
public:
bool finalp;
unsigned short number_of_arcs;
LMArc *arc;
LMNode( long pos, FILE *lmafile ) {
fseek(lmafile, pos, SEEK_SET);
fread(&finalp, sizeof(finalp), 1, lmafile);
fread(&number_of_arcs, sizeof(number_of_arcs), 1, lmafile);
arc = new LMArc[number_of_arcs];
for( int i=0; i<(int)number_of_arcs; i++ ) {
Character lc,uc;
unsigned int tpos;
fread(&lc, sizeof(lc), 1, lmafile);
fread(&uc, sizeof(uc), 1, lmafile);
fread(&tpos, sizeof(tpos), 1, lmafile);
arc[i].label = Label(lc,uc);
arc[i].tnodepos = tpos;
}
};
~LMNode() { delete[] arc; };
};
/***************** class LowMemTransducer *************************/
// Transducer whose nodes are read from a file on demand instead of being
// held in memory as a whole.
class LowMemTransducer {
protected:
// Recursive search used by analyze_string().
void analyze( const LMNode&, std::vector<Character> &input, size_t ipos,
Analysis&, std::vector<Analysis>& );
public:
// When set, analyze_string() passes its results to alphabet.disambiguate().
bool simplest_only;
// File the nodes are loaded from; supplied by the caller and not closed
// by the destructor.
FILE *lmafile;
// Root node, allocated in the constructor and deleted in the destructor.
LMNode *rootnode;
Alphabet alphabet;
LowMemTransducer( FILE* );
~LowMemTransducer() { delete rootnode; };
void analyze_string( char *string, std::vector<Analysis> &analyses );
};
}
#endif
/*******************************************************************/
/* */
/* FILE make-compact.C */
/* MODULE make-compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE Code needed for generating compact automata */
/* */
/*******************************************************************/
#include <math.h>
#include "make-compact.h"
namespace SFST {
// Temporary arc record used while sorting the arcs of one node.
// `cv` is the sort key (the character value on the chosen level, with
// epsilon mapped to -1 by the caller).
class ARC {
public:
  int cv;
  Label label;
  unsigned int target_node;

  // Orders arcs by sort key.  The argument is taken by const reference
  // so that std::sort does not copy an ARC for every comparison.
  bool operator< ( const ARC &a ) const {
    return cv < a.cv;
  };
};
// Hash table mapping a Label to a number.
// NOTE(review): not referenced by the functions visible in this part of
// the file — confirm where it is used.
typedef hash_map<Label, size_t, Label::label_hash, Label::label_eq> LabelNumber;
/*******************************************************************/
/* */
/* MakeCompactTransducer::sort */
/* */
/*******************************************************************/
// Sorts the outgoing arcs of every node by the character found on the
// given level.  Epsilon arcs get the key -1 so that they always come
// first, whatever numeric value epsilon has.  The label[] and
// target_node[] tables are reordered in place, node by node.
void MakeCompactTransducer::sort( Level level )
{
  for( unsigned int node=0; node<number_of_nodes; node++) {
    const unsigned int begin = first_arc[node];
    const unsigned int end = first_arc[node+1];
    const int count = end - begin;

    // copy the arcs of this node to a scratch table
    ARC *scratch = new ARC[count];
    for( unsigned int i=begin; i<end; i++) {
      ARC &slot = scratch[i-begin];
      int key = (int)label[i].get_char(level);
      // make sure that epsilon arcs are stored at the beginning
      // even if epsilon is not 0
      if (key == (int)Label::epsilon)
        key = -1;
      slot.cv = key;
      slot.label = label[i];
      slot.target_node = target_node[i];
    }

    std::sort( scratch, scratch+count );

    // write the sorted arcs back
    for( unsigned int i=begin; i<end; i++) {
      const ARC &slot = scratch[i-begin];
      label[i] = slot.label;
      target_node[i] = slot.target_node;
    }
    delete[] scratch;
  }
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::count_arcs */
/* */
/*******************************************************************/
// Recursively visits every node reachable from `node` once, recording
// its final flag in finalp[] and its number of outgoing arcs in
// first_arc[] (indexed by the node's number in `index`).
void MakeCompactTransducer::count_arcs( Node *node, NodeNumbering &index,
                                        VType vmark )
{
  if (node->was_visited( vmark ))
    return;

  const unsigned int id = index[node];
  finalp[id] = node->is_final();
  first_arc[id] = 0;

  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *out = it;
    first_arc[id]++;
    count_arcs(out->target_node(), index, vmark);
  }
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_arcs */
/* */
/*******************************************************************/
// Recursively visits every node reachable from `node` once and writes
// its arcs into label[] / target_node[], starting at the slot given by
// first_arc[] for that node.
void MakeCompactTransducer::store_arcs( Node *node, NodeNumbering &index,
                                        VType vmark )
{
  if (node->was_visited( vmark ))
    return;

  unsigned int slot = first_arc[index[node]];
  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *out = it;
    label[slot] = out->label();
    target_node[slot] = index[out->target_node()];
    slot++;
    store_arcs(out->target_node(), index, vmark);
  }
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::MakeCompactTransducer */
/* */
/*******************************************************************/
// Converts transducer `a` into the compact table representation
// (finalp[], first_arc[], label[], target_node[]) with the arcs of every
// node sorted on level `l`.  Exits with status 1 if `a` is infinitely
// ambiguous.
MakeCompactTransducer::MakeCompactTransducer( Transducer &a, Level l )
{
  if (a.is_infinitely_ambiguous()) {
    std::cerr << "Error: resulting transducer contains an infinite loop!\n";
    exit(1);
  }

  NodeNumbering index(a);
  alphabet.copy(a.alphabet);

  // per-node tables
  number_of_nodes = (unsigned)index.number_of_nodes();
  finalp = new char[number_of_nodes];
  first_arc = new unsigned int[number_of_nodes+1];

  // first pass: first_arc[n] receives the number of arcs leaving node n
  a.incr_vmark();
  count_arcs( a.root_node(), index, a.vmark );

  // turn the counts into start offsets: first_arc[n] becomes the sum of
  // the counts of all nodes before n, first_arc[number_of_nodes] the total
  unsigned int offset = 0;
  for( unsigned int n=0; n<number_of_nodes; n++ ) {
    const unsigned int count = first_arc[n];
    first_arc[n] = offset;
    offset += count;
  }
  first_arc[number_of_nodes] = offset;
  number_of_arcs = first_arc[number_of_nodes];

  // per-arc tables
  label = new Label[number_of_arcs];
  target_node = new unsigned int[number_of_arcs];

  // second pass: fill in the arcs, then sort them
  a.incr_vmark();
  store_arcs( a.root_node(), index, a.vmark );
  sort( l );
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_finalp */
/* */
/*******************************************************************/
// Writes the finality flags of all nodes to `file` as a bit vector.
//
// Flags are packed eight to a byte, the flag of the lowest-numbered node in
// the most significant bit. If the number of nodes is not a multiple of
// eight, the last byte is padded with zero bits at the low end.
void MakeCompactTransducer::store_finalp( FILE *file )
{
  unsigned char byte = 0;   // byte currently being assembled
  int filled = 0;           // number of flags already placed in `byte`

  for( size_t node=0; node<number_of_nodes; node++ ) {
    byte = (unsigned char)(byte << 1);
    if (finalp[node])
      byte |= 1;
    filled++;
    if (filled == 8) {
      fputc(byte, file);
      byte = 0;
      filled = 0;
    }
  }

  // flush an incomplete last byte, left-aligned
  if (filled > 0) {
    byte = (unsigned char)(byte << (8-filled));
    fputc(byte, file);
  }
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_first_arcs */
/* */
/* The data is encoded with the minimal number of bits needed. */
/* */
/*******************************************************************/
// Writes first_arc[0..number_of_nodes] to `file` as a sequence of
// fixed-width bit fields.
//
// The fields are packed back to back, most significant bit first, into
// unsigned int words. Each completed word is written with fwrite(); a field
// that straddles a word boundary continues at the top of the next word. A
// partially filled last word is written with its unused low bits set to 0.
void MakeCompactTransducer::store_first_arcs( FILE *file )
{
  const int word_bits = (int)sizeof(unsigned int) * 8;

  // compute number of bits required for storing each item
  // NOTE(review): the formula is intentionally left unchanged; the code that
  // reads this format presumably derives the width the same way - confirm
  // before touching it.
  const int bits=(int)ceil(log(number_of_arcs+1)/log(2));

  // With a width of 0 (no arcs at all) the packing loop below would shift by
  // the full word width, which is undefined behaviour. In that case the bit
  // cursor never advances and nothing is ever written, so returning here
  // produces the same (empty) output.
  if (bits <= 0)
    return;

  unsigned int word = 0;   // output word being assembled
  int used = 0;            // number of bits of `word` already occupied

  for( size_t i=0; i<=number_of_nodes; i++ ) {
    const unsigned int value = first_arc[i];

    // left-align the field, then move it behind the bits already in place
    word |= (value << (word_bits - bits)) >> used;
    used += bits;

    if (used >= word_bits) {
      fwrite(&word, sizeof(word), 1, file);
      used -= word_bits;
      // the low `used` bits of the field did not fit; they open the next word
      if (used == 0)
        word = 0;
      else
        word = value << (word_bits - used);
    }
  }

  if (used > 0)
    fwrite(&word, sizeof(word), 1, file);
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_target_nodes */
/* */
/*******************************************************************/
// Writes target_node[0..number_of_arcs-1] to `file` as a sequence of
// fixed-width bit fields.
//
// The fields are packed back to back, most significant bit first, into
// unsigned int words. Each completed word is written with fwrite(); a field
// that straddles a word boundary continues at the top of the next word. A
// partially filled last word is written with its unused low bits set to 0.
void MakeCompactTransducer::store_target_nodes( FILE *file )
{
  const int word_bits = (int)sizeof(unsigned int) * 8;

  // number of bits per stored node number
  // NOTE(review): the formula is intentionally left unchanged; the code that
  // reads this format presumably derives the width the same way - confirm
  // before touching it.
  const int bits=(int)ceil(log(number_of_nodes)/log(2));

  // With a width of 0 (a single node) the packing loop below would shift by
  // the full word width, which is undefined behaviour. In that case the bit
  // cursor never advances and nothing is ever written, so returning here
  // produces the same (empty) output.
  if (bits <= 0)
    return;

  unsigned int word = 0;   // output word being assembled
  int used = 0;            // number of bits of `word` already occupied

  for( size_t i=0; i<number_of_arcs; i++ ) {
    const unsigned int value = target_node[i];

    // left-align the field, then move it behind the bits already in place
    word |= (value << (word_bits - bits)) >> used;
    used += bits;

    if (used >= word_bits) {
      fwrite(&word, sizeof(word), 1, file);
      used -= word_bits;
      // the low `used` bits of the field did not fit; they open the next word
      if (used == 0)
        word = 0;
      else
        word = value << (word_bits - used);
    }
  }

  if (used > 0)
    fwrite(&word, sizeof(word), 1, file);
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_labels */
/* */
/*******************************************************************/
// Writes the label of every arc to `file` as a sequence of fixed-width bit
// fields.
//
// Labels are not stored directly: the labels of `alphabet` are numbered
// 0, 1, 2, ... in iteration order and each arc label is replaced by its
// number. The numbers are packed back to back, most significant bit first,
// into unsigned int words. Each completed word is written with fwrite(); a
// partially filled last word is written with its unused low bits set to 0.
void MakeCompactTransducer::store_labels( FILE *file )
{
  // number the labels in the order in which the alphabet enumerates them
  size_t next_number=0;
  LabelNumber LNum;
  for( Alphabet::const_iterator it=alphabet.begin();
       it != alphabet.end(); it++ )
  {
    Label l=*it;
    LNum[l] = next_number++;
  }

  const int word_bits = (int)sizeof(unsigned int) * 8;

  // number of bits per stored label number
  // NOTE(review): the formula is intentionally left unchanged; the code that
  // reads this format presumably derives the width the same way - confirm
  // before touching it.
  const int bits=(int)ceil(log((double)alphabet.size())/log(2));

  // With a width of 0 (an alphabet with a single label) the packing loop
  // below would shift by the full word width, which is undefined behaviour.
  // In that case the bit cursor never advances and nothing is ever written,
  // so returning here produces the same (empty) output.
  if (bits <= 0)
    return;

  unsigned int word = 0;   // output word being assembled
  int used = 0;            // number of bits of `word` already occupied

  for( size_t i=0; i<number_of_arcs; i++ ) {
    const unsigned int value = (unsigned)LNum[label[i]];

    // left-align the field, then move it behind the bits already in place
    word |= (value << (word_bits - bits)) >> used;
    used += bits;

    if (used >= word_bits) {
      fwrite(&word, sizeof(word), 1, file);
      used -= word_bits;
      // the low `used` bits of the field did not fit; they open the next word
      if (used == 0)
        word = 0;
      else
        word = value << (word_bits - used);
    }
  }

  if (used > 0)
    fwrite(&word, sizeof(word), 1, file);
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store */
/* */
/*******************************************************************/
// Writes the complete compact transducer to `file`.
//
// Layout, in order: the character 'c', the alphabet, number_of_nodes,
// number_of_arcs (both as raw in-memory values), the finality bit vector,
// the packed first-arc table, the packed arc labels and the packed arc
// target nodes.
//
// Throws a C string (const char*) if the stream's error indicator is set
// once everything has been written.
void MakeCompactTransducer::store( FILE *file )
{
  // format marker and alphabet
  fputc('c',file);
  alphabet.store(file);

  // sizes
  fwrite(&number_of_nodes, sizeof number_of_nodes, 1, file);
  fwrite(&number_of_arcs, sizeof number_of_arcs, 1, file);

  // node and arc tables
  store_finalp(file);
  store_first_arcs(file);
  store_labels(file);
  store_target_nodes(file);

  // a single check at the end covers all writes above
  if (!ferror(file))
    return;
  throw "Error encountered while writing transducer to file\n";
}
}
/*******************************************************************/
/* */
/* FILE make-compact.h */
/* MODULE make-compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/*******************************************************************/
#ifndef _MAKE_COMPACT_H_
#define _MAKE_COMPACT_H_
#include "fst.h"
#include "compact.h"
namespace SFST {
// Converts a Transducer into the compact representation and writes it to a
// file. CompactTransducer is a private base: its members are used by the
// implementation but are not part of this class's public interface.
class MakeCompactTransducer : private CompactTransducer {

 public:
  // Builds the compact representation of `a`; the second argument selects
  // the level passed to sort() (upper by default).
  MakeCompactTransducer( Transducer &a, Level sort=upper );

  // Sorts the arcs on the given level.
  void sort( Level );

  // Writes the compact transducer to `file`.
  void store( FILE *file );

 private:
  // Recursive passes over the source transducer used by the constructor:
  // the first counts the outgoing arcs per node, the second copies them.
  void count_arcs(Node *node, NodeNumbering &index, VType vmark);
  void store_arcs(Node *node, NodeNumbering &index, VType vmark);

  // Writers for the individual sections of the output file, used by store().
  void store_finalp( FILE *file );
  void store_first_arcs( FILE *file );
  void store_target_nodes( FILE *file );
  void store_labels( FILE *file );
};
}
#endif
.TH fst-compact 1 "October 2004" "" "fst-compact"
.SH NAME
fst-compact \- transforms transducers to the compact format
.SH SYNOPSIS
.B fst-compact
.I [ file [ file ] ]
.SH DESCRIPTION
.I fst-compact
reads a transducer in standard format from the input and writes it in
compact format to the output. The compact format is required by
.I fst-infl2.
.SH BUGS
No bugs are known so far.
.SH "SEE ALSO"
fst-compiler, fst-infl2
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
.TH fst-compare 1 "February 2002" "" "fst-compare"
.SH NAME
fst-compare \- checks whether two transducers are equivalent
.SH SYNOPSIS
.B fst-compare
.I file1 file2
.SH DESCRIPTION
.I fst-compare
checks whether the two transducers stored in
.I file1
and
.I file2
are equivalent.
.SH BUGS
No bugs are known so far.
.SH "SEE ALSO"
fst-compiler
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
.TH fst-compiler 1 "December 2004" "" "fst-compiler"
.SH NAME
fst-compiler fst-compiler-utf8 \- Two compilers for SFST programs
.SH SYNOPSIS
.B fst-compiler
.I grammar-file
[
.I output-file
]
.br
.B fst-compiler-utf8
.I grammar-file
[
.I output-file
]
.SH OPTIONS
.TP
.B \-c
Store the transducer in compact format which is used by fst-infl2.
.TP
.B \-l
Store the transducer in lowmem format.
.TP
.B \-s
Switch surface and analysis layer of the transducer. You have to use
this switch in order to use
.I fst-infl (fst-infl2, fst-infl3)
for generation rather than analysis.
.SH DESCRIPTION
.B fst-compiler
is a compiler for finite-state transducer programs. It generates a
minimized finite state transducer which can be used with
.I fst-mor,
.I fst-infl,
.I fst-print,
.I fst-compare,
.I fst-parse,
and
.I fst-lattice.
The compact transducer representation which is generated with the -c
flag, is supported by
.I fst-infl2,
.I fst-train,
and
.I fst-match.
The memory-efficient transducer representation which is generated with
the -l flag, is only supported by
.I fst-infl3.
.PP
The first program argument is the name of a file which contains the
transducer program. The programming language is described below. The
second argument is the name of the file to which the resulting
transducer will be written in binary form. If a second argument is
missing, the output will be written to
.I stdout.
.PP
.I fst-compiler-utf8
differs from
.I fst-compiler
only in the character encoding.
.I fst-compiler-utf8
supports UTF8 encoding of the source files whereas
.I fst-compiler
is to be used for 8-Bit character codes like latin1 which are an
extension of the ASCII code. Information about the encoding is stored
in the transducer files and used by the other SFST programs.
.SH "FILE FORMATS"
A transducer program consists of an (optional) sequence of
.I alphabet
and
.I variable
definitions followed by a single
.I transducer expression
which defines the result transducer.
.PP
.SM Alphabet
.PP
An alphabet definition consists of the keyword ALPHABET followed by
= and some transducer expression e.g.
.TP
ALPHABET = [a-z]:[A-Z]
.PP
This command redefines the alphabet as the set of symbol pairs
occurring on the transitions of the transducer. Occurrences of
two-level operators, negation operators and unquoted periods always
have to be preceded by an alphabet definition.
.PP
.SM Variables
.PP
There are two different types of variables.
.I Symbol set variables
are enclosed by hash signs (#) and take symbol sequences (see below)
as values:
.TP 0
#UC# = A-Z
#LC# = a-z
.PP
.I Transducer variables
are enclosed by dollar signs and take transducer expressions as
values:
.TP 0
$MAP$ = [a-z]:[A-Z]+
$MAP$ = [#LC#]:[#UC#]+
.PP
Variables whose name starts with the symbol `=' are special
.I agreement
variables. If an agreement variable occurs more than once in a
transducer expression, it will always have the same value. Consider
the following transducer program:
.TP 0
$=1$ = [abc]
$=1$ X $=1$
.PP
The result transducer recognizes the strings aXa, bXb, and cXc. Only
acyclic transducers (i.e. transducers with a finite set of string
mappings) can be assigned to agreement variables.
.PP
.SM Symbols
.PP
A symbol is either
.PP
- a single character like A s 5,
.PP
- a quoted character like \\* or \\_,
.TP 2
- a multi-character symbol like <X> or <ab.c5> (which is always
enclosed in angle brackets) or
.TP
- a backslash followed by a number which is the numeric code of the
designated character
.PP
- the null symbol <>.
.PP
.SM Symbol sequence
.PP
A symbol sequence is a sequence of characters, multi-character symbols
and character ranges, e.g. a-z \\. <x>.
.PP
.SM symbol range
.PP
A symbol range is either
.PP
- a single symbol
.PP
- a symbol sequence enclosed in square brackets like [A-Za-z] or
.PP
- a symbol sequence starting with ^ and enclosed in square brackets
like [^A-Za-z] (designating the complement of [a-zA-Z]) or
.PP
- the period (which represents any symbol from the alphabet)
.PP
.SM Transducer expressions
.PP
A transducer expression (TE) is recursively defined as follows:
.TP 2
- A pair of two symbol ranges separated by a colon is a TE.
[a-z]:[A-Z]
.TP 1
- A single symbol range like [a-z] is a TE.
.br
It is a short form for [a-z]:[a-z].
.TP 1
- Two symbol sequences enclosed in braces and separated by a colon are
a TE. {a[bc]}:{def} is equivalent to a:d b:e <>:f | a:d c:e <>:f.
.TP 1
- X Y is a TE if X and Y are TEs.
.br
(Blanks are ignored unless they are quoted.)
.TP 1
- (X) is a TE if X is a TE.
.TP 1
- X op is a TE if X is a TE and op is either * (Kleene's star operator), +
(Kleene's plus operator), or ? (optionality operator).
.TP 1
- op X is a TE if X is a TE and op is either ! (negation operator), ^
(target language extraction operator), _ (source language extraction
operator), or ^_ (source and target switch operator).
.TP 1
- X op Y is a TE if X and Y are TEs and op is either & (conjunction
operator), | (disjunction operator), || (composition operator), or -
(subtraction operator).
.TP 1
- L x op y R is a TE if L and R are TEs, x and y are symbol ranges and
op is either => (two-level restriction), <= (two-level coercion), or
<=> (two-level restriction and coercion).
.TP 1
- X op L__R is a TE if X, L and R are TEs and op is either ^-> (upward
replacement), _-> (downward replacement), /-> (leftward replacement)
or \\-> (rightward replacement). Furthermore, L and R must define
automata (i.e. which map their strings onto themselves). These
operators correspond to Karttunen's replace operators. If the arrow is
followed by a question mark (?), the replacement becomes optional.
.TP 1
- X << l is a TE if X is a TE, and l is either of the form
a or the form a:b where a and b are single characters or symbols. The
result is a transducer where l was freely inserted into X. The
transducer ab << c for instance is equivalent to c*ac*bc*.
.TP 1
- X op Y L1__R1, ... , LN__RN is a TE if X, Y, L1 through LN and R1
through RN are TEs, and op is either => (general restriction), <=
(general coercion), ^=> (general surface restriction), ^<= (general
surface coercion), ^<=> (general surface restriction and coercion),
_=> (general deep restriction), _<= (general deep coercion), _<=>
(general deep restriction and coercion). (These operators were
implemented following a suggestion by Anssi Yli-Jyra.)
.TP 1
- "fname" is a TE. The compiler reads the file named fname and turns
it into a transducer of the form line1|line2|line3|... where linex is
the x-th line of the file. All characters other than : and \\ are
interpreted literally (i.e. not as operators). This TE is typically
used to read a morpheme list from a file.
.TP 1
- "<fname>" is a TE. The compiler reads a pre-compiled transducer from
the file named fname. This
.PP
Further Features
.PP
Comments start with the symbol % and extend up to the end of the line.
Blanks are ignored unless they are quoted. Expressions terminate at
the end of a line unless the end of line is preceded by a backslash.
The command
.TP
#include "fname"
.PP
can be used to insert source code from a
file named fname.
The command
.TP
RE >> "fname"
.PP
stores the regular expression RE in the file fname.
The command
.TP
#use hopcroft
.PP
tells the compiler to use the Hopcroft minimisation algorithm from now on, and
.TP
#use default
.PP
switches back to the default minimisation algorithm (Brzozowski).
The command
.SH EXAMPLE
Here is an example of a simple transducer program. Assuming that
the file "adj-stems" contains the three lines
.PP
.ti +3
easy
.ti +3
late
.ti +3
big
.PP
this transducer will correctly analyze the adjective forms easy,
easier, easiest and late, later, and latest.
.PP
ALPHABET = [a-zA-Z] y:i e:<> <ADJ>:<>
$R$ = y<=>i (<ADJ>:<> e)
$R2$ = e<=><> (<ADJ>:<> e)
$R$ = $R$ & $R2$
$Stems$ = "adj-stems"
$S$ = $Stems$ <ADJ> (<pos>:<>|<cmp>:{er}|<sup>:{est})
$S$ || $R$
.SH "EXIT STATUS"
.B fst-compiler
returns 0 unless some error occurs.
.\" .SH FILES
.SH BUGS
The compiler gets the operator precedence wrong in case of two-level
rules and interprets the expression "ab c<=>d ef" as "a(b c<=>d
(ef))". Therefore, you should always surround the left context of
two-level rules with parentheses: (ab) c<=>d (ef)
.SH "SEE ALSO"
fst-mor, fst-infl, fst-infl2, fst-infl3, fst-print, fst-compact,
fst-parse, fst-compare, fst-lowmem, fst-lattice, fst-train
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
.TH fst-compiler 1 "December 2004" "" "fst-compiler"
.SH NAME
fst-compiler fst-compiler-utf8 \- Two compilers for SFST programs
.SH SYNOPSIS
.B fst-compiler
.I grammar-file
[
.I output-file
]
.br
.B fst-compiler-utf8
.I grammar-file
[
.I output-file
]
.SH OPTIONS
.TP
.B \-c
Store the transducer in compact format which is used by fst-infl2.
.TP
.B \-l
Store the transducer in lowmem format.
.TP
.B \-s
Switch surface and analysis layer of the transducer. You have to use
this switch in order to use
.I fst-infl (fst-infl2, fst-infl3)
for generation rather than analysis.
.SH DESCRIPTION
.B fst-compiler
is a compiler for finite-state transducer programs. It generates a
minimized finite state transducer which can be used with
.I fst-mor,
.I fst-infl,
.I fst-print,
.I fst-compare,
.I fst-parse,
and
.I fst-lattice.
The compact transducer representation which is generated with the -c
flag, is supported by
.I fst-infl2,
.I fst-train,
and
.I fst-match.
The memory-efficient transducer representation which is generated with
the -l flag, is only supported by
.I fst-infl3.
.PP
The first program argument is the name of a file which contains the
transducer program. The programming language is described below. The
second argument is the name of the file to which the resulting
transducer will be written in binary form. If a second argument is
missing, the output will be written to
.I stdout.
.PP
.I fst-compiler-utf8
differs from
.I fst-compiler
only in the character encoding.
.I fst-compiler-utf8
supports UTF8 encoding of the source files whereas
.I fst-compiler
is to be used for 8-Bit character codes like latin1 which are an
extension of the ASCII code. Information about the encoding is stored
in the transducer files and used by the other SFST programs.
.SH "FILE FORMATS"
A transducer program consists of an (optional) sequence of
.I alphabet
and
.I variable
definitions followed by a single
.I transducer expression
which defines the result transducer.
.PP
.SM Alphabet
.PP
An alphabet definition consists of the keyword ALPHABET followed by
= and some transducer expression e.g.
.TP
ALPHABET = [a-z]:[A-Z]
.PP
This command redefines the alphabet as the set of symbol pairs
occurring on the transitions of the transducer. Occurrences of
two-level operators, negation operators and unquoted periods always
have to be preceded by an alphabet definition.
.PP
.SM Variables
.PP
There are two different types of variables.
.I Symbol set variables
are enclosed by hash signs (#) and take symbol sequences (see below)
as values:
.TP 0
#UC# = A-Z
#LC# = a-z
.PP
.I Transducer variables
are enclosed by dollar signs and take transducer expressions as
values:
.TP 0
$MAP$ = [a-z]:[A-Z]+
$MAP$ = [#LC#]:[#UC#]+
.PP
Variables whose name starts with the symbol `=' are special
.I agreement
variables. If an agreement variable occurs more than once in a
transducer expression, it will always have the same value. Consider
the following transducer program:
.TP 0
$=1$ = [abc]
$=1$ X $=1$
.PP
The result transducer recognizes the strings aXa, bXb, and cXc. Only
acyclic transducers (i.e. transducers with a finite set of string
mappings) can be assigned to agreement variables.
.PP
.SM Symbols
.PP
A symbol is either
.PP
- a single character like A s 5,
.PP
- a quoted character like \\* or \\_,
.TP 2
- a multi-character symbol like <X> or <ab.c5> (which is always
enclosed in angle brackets) or
.TP
- a backslash followed by a number which is the numeric code of the
designated character
.PP
- the null symbol <>.
.PP
.SM Symbol sequence
.PP
A symbol sequence is a sequence of characters, multi-character symbols
and character ranges, e.g. a-z \\. <x>.
.PP
.SM symbol range
.PP
A symbol range is either
.PP
- a single symbol
.PP
- a symbol sequence enclosed in square brackets like [A-Za-z] or
.PP
- a symbol sequence starting with ^ and enclosed in square brackets
like [^A-Za-z] (designating the complement of [a-zA-Z]) or
.PP
- the period (which represents any symbol from the alphabet)
.PP
.SM Transducer expressions
.PP
A transducer expression (TE) is recursively defined as follows:
.TP 2
- A pair of two symbol ranges separated by a colon is a TE.
[a-z]:[A-Z]
.TP 1
- A single symbol range like [a-z] is a TE.
.br
It is a short form for [a-z]:[a-z].
.TP 1
- Two symbol sequences enclosed in braces and separated by a colon are
a TE. {a[bc]}:{def} is equivalent to a:d b:e <>:f | a:d c:e <>:f.
.TP 1
- X Y is a TE if X and Y are TEs.
.br
(Blanks are ignored unless they are quoted.)
.TP 1
- (X) is a TE if X is a TE.
.TP 1
- X op is a TE if X is a TE and op is either * (Kleene's star operator), +
(Kleene's plus operator), or ? (optionality operator).
.TP 1
- op X is a TE if X is a TE and op is either ! (negation operator), ^
(target language extraction operator), _ (source language extraction
operator), or ^_ (source and target switch operator).
.TP 1
- X op Y is a TE if X and Y are TEs and op is either & (conjunction
operator), | (disjunction operator), || (composition operator), or -
(subtraction operator).
.TP 1
- L x op y R is a TE if L and R are TEs, x and y are symbol ranges and
op is either => (two-level restriction), <= (two-level coercion), or
<=> (two-level restriction and coercion).
.TP 1
- X op L__R is a TE if X, L and R are TEs and op is either ^-> (upward
replacement), _-> (downward replacement), /-> (leftward replacement)
or \\-> (rightward replacement). Furthermore, L and R must define
automata (i.e. which map their strings onto themselves). These
operators correspond to Karttunen's replace operators. If the arrow is
followed by a question mark (?), the replacement becomes optional.
.TP 1
- X << l is a TE if X is a TE, and l is either of the form
a or the form a:b where a and b are single characters or symbols. The
result is a transducer where l was freely inserted into X. The
transducer ab << c for instance is equivalent to c*ac*bc*.
.TP 1
- X op Y L1__R1, ... , LN__RN is a TE if X, Y, L1 through LN and R1
through RN are TEs, and op is either => (general restriction), <=
(general coercion), ^=> (general surface restriction), ^<= (general
surface coercion), ^<=> (general surface restriction and coercion),
_=> (general deep restriction), _<= (general deep coercion), _<=>
(general deep restriction and coercion). (These operators were
implemented following a suggestion by Anssi Yli-Jyra.)
.TP 1
- "fname" is a TE. The compiler reads the file named fname and turns
it into a transducer of the form line1|line2|line3|... where linex is
the x-th line of the file. All characters other than : and \\ are
interpreted literally (i.e. not as operators). This TE is typically
used to read a morpheme list from a file.
.TP 1
- "<fname>" is a TE. The compiler reads a pre-compiled transducer from
the file named fname. This
.PP
Further Features
.PP
Comments start with the symbol % and extend up to the end of the line.
Blanks are ignored unless they are quoted. Expressions terminate at
the end of a line unless the end of line is preceded by a backslash.
The command
.TP
#include "fname"
.PP
can be used to insert source code from a
file named fname.
The command
.TP
RE >> "fname"
.PP
stores the regular expression RE in the file fname.
The command
.TP
#use hopcroft
.PP
tells the compiler to use the Hopcroft minimisation algorithm from now on, and
.TP
#use default
.PP
switches back to the default minimisation algorithm (Brzozowski).
The command
.SH EXAMPLE
Here is an example of a simple transducer program. Assuming that
the file "adj-stems" contains the three lines
.PP
.ti +3
easy
.ti +3
late
.ti +3
big
.PP
this transducer will correctly analyze the adjective forms easy,
easier, easiest and late, later, and latest.
.PP
ALPHABET = [a-zA-Z] y:i e:<> <ADJ>:<>
$R$ = y<=>i (<ADJ>:<> e)
$R2$ = e<=><> (<ADJ>:<> e)
$R$ = $R$ & $R2$
$Stems$ = "adj-stems"
$S$ = $Stems$ <ADJ> (<pos>:<>|<cmp>:{er}|<sup>:{est})
$S$ || $R$
.SH "EXIT STATUS"
.B fst-compiler
returns 0 unless some error occurs.
.\" .SH FILES
.SH BUGS
The compiler gets the operator precedence wrong in case of two-level
rules and interprets the expression "ab c<=>d ef" as "a(b c<=>d
(ef))". Therefore, you should always surround the left context of
two-level rules with parentheses: (ab) c<=>d (ef)
.SH "SEE ALSO"
fst-mor, fst-infl, fst-infl2, fst-infl3, fst-print, fst-compact,
fst-parse, fst-compare, fst-lowmem, fst-lattice, fst-train
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
.TH fst-generate 1 "October 2003" "" "fst-generate"
.SH NAME
fst-generate \- randomly generate strings of character pairs from a transducer
.SH SYNOPSIS
.B fst-generate
.I file1 [ file [ file ] ]
.SH OPTIONS
.TP
.B \-s
print the surface characters only
.TP
.B \-a
print the analysis characters only
.TP
.B \-h
print usage information.
.TP
.B \-q
quiet mode
.TP
.B \-d
debugging mode
.SH DESCRIPTION
.I fst-generate
randomly generates sequences of character pairs which are accepted by
the transducer.
.SH BUGS
No bugs are known so far.
.SH "SEE ALSO"
fst-parse
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
.TH fst-infl 1 "November 2004" "" "fst-infl"
.SH NAME
fst-infl fst-infl2 fst-infl3 \- morphological analysers
.SH SYNOPSIS
.B fst-infl [ options ]
.I file
[
.I input-file
[
.I output-file
]
]
.br
.B fst-infl2 [ options ]
.I file
[
.I input-file
[
.I output-file
]
]
.br
.B fst-infl3 [ options ]
.I file
[
.I input-file
[
.I output-file
]
]
.SH OPTIONS
.TP
.B \-t file
Read an alternative transducer from
.I file
and use it if the main transducer fails to find an analysis. By
iterating this option, a cascade of transducers may be tried to find
an analysis.
.TP
.B \-b
Print surface and analysis symbols. (fst-infl2 only)
.TP
.B \-n
Print multi-character symbols without the enclosing angle brackets.
(fst-infl only)
.TP
.B \-d
The analyses are symbolically disambiguated by returning only analyses
with a minimal number of morphemes. This option requires that morpheme
boundaries are marked with the tag <X>. If no <X> tag is found in the
analysis string, then the program (basically) counts the number of
multi-character symbols consisting entirely of upper-case characters
and uses this count for disambiguation. The latter heuristic was
developed for the German SMOR morphology. (This option is only
available with fst-infl2 and fst-infl3.)
.TP
.B \-e n
If no regular analysis is found, do robust matching and print analyses
with up to
.I n
edit errors. The set of edit operations currently includes
replacement, insertion and deletion. Each operation has currently a
fixed error weight of 1. (fst-infl2 only)
.TP
.B \-% f
Disambiguates the analyses statistically and prints the most likely
analyses with at least f % of the total probability mass of the
analyses. The transducer weights are read from a file obtained by
appending
.I .prob
to the name of the transducer file. The weight files are created with
.I fst-train.
(fst-infl2 only)
.TP
.B \-p
Print the probability of each analysis. (fst-infl2 only)
.TP
.B \-c
Use this option if the transducer was compiled on a computer with a
different endianness. If you have a transducer which was compiled
on a Sparc computer and you want to use it on a Pentium, you need to
use this option. (fst-infl2 only)
.TP
.B \-q
Suppress status messages.
.TP
.B \-h
Print usage information.
.SH DESCRIPTION
.I fst-infl
is a morphological analyser. The first argument is the name of a file
which was generated by
.I fst-compiler.
The second argument is the name of the input file. The third argument
is the output file. If the third argument is missing, output is
directed to
.I stdout.
If the second argument is missing, as well, input is read from
.I stdin.
.I fst-infl2
is similar to
.I fst-infl
but needs a transducer in compact format (see the man pages for
.I fst-compiler and fst-compact). fst-infl2 is implemented differently
from fst-infl and usually much faster.
.I fst-infl3
is also similar to
.I fst-infl
but needs a transducer in lowmem format (see the man pages for
.I fst-compiler and fst-lowmem). fst-infl3 accesses the transducer on
disc rather than reading it into memory. It starts very fast and needs
very little memory, but is slower than fst-infl2.
.I fst-infl
reads the transducer which is stored in the argument file. Then it
reads the input file line by line. Each line is analysed with the
transducer and all resulting analyses are printed (see also the man
pages for
.I fst-mor).
.SH BUGS
No bugs are known so far.
.SH "SEE ALSO"
fst-compiler, fst-mor
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment