Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
germanet
germanet-decompounder
Commits
0b611765
Commit
0b611765
authored
Jan 23, 2019
by
Ben Campbell
Browse files
Initial commit
parents
Changes
156
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3944 additions
and
0 deletions
+3944
-0
src/main/resources/SMOR/src/fst.o
src/main/resources/SMOR/src/fst.o
+0
-0
src/main/resources/SMOR/src/generate.C
src/main/resources/SMOR/src/generate.C
+104
-0
src/main/resources/SMOR/src/generate.o
src/main/resources/SMOR/src/generate.o
+0
-0
src/main/resources/SMOR/src/hopcroft.C
src/main/resources/SMOR/src/hopcroft.C
+529
-0
src/main/resources/SMOR/src/hopcroft.o
src/main/resources/SMOR/src/hopcroft.o
+0
-0
src/main/resources/SMOR/src/interface.C
src/main/resources/SMOR/src/interface.C
+1811
-0
src/main/resources/SMOR/src/interface.h
src/main/resources/SMOR/src/interface.h
+165
-0
src/main/resources/SMOR/src/interface.o
src/main/resources/SMOR/src/interface.o
+0
-0
src/main/resources/SMOR/src/lowmem.C
src/main/resources/SMOR/src/lowmem.C
+93
-0
src/main/resources/SMOR/src/lowmem.h
src/main/resources/SMOR/src/lowmem.h
+84
-0
src/main/resources/SMOR/src/lowmem.o
src/main/resources/SMOR/src/lowmem.o
+0
-0
src/main/resources/SMOR/src/make-compact.C
src/main/resources/SMOR/src/make-compact.C
+327
-0
src/main/resources/SMOR/src/make-compact.h
src/main/resources/SMOR/src/make-compact.h
+36
-0
src/main/resources/SMOR/src/make-compact.o
src/main/resources/SMOR/src/make-compact.o
+0
-0
src/main/resources/SMOR/src/man1/fst-compact.1
src/main/resources/SMOR/src/man1/fst-compact.1
+22
-0
src/main/resources/SMOR/src/man1/fst-compare.1
src/main/resources/SMOR/src/man1/fst-compare.1
+24
-0
src/main/resources/SMOR/src/man1/fst-compiler-utf8.1
src/main/resources/SMOR/src/man1/fst-compiler-utf8.1
+290
-0
src/main/resources/SMOR/src/man1/fst-compiler.1
src/main/resources/SMOR/src/man1/fst-compiler.1
+290
-0
src/main/resources/SMOR/src/man1/fst-generate.1
src/main/resources/SMOR/src/man1/fst-generate.1
+37
-0
src/main/resources/SMOR/src/man1/fst-infl.1
src/main/resources/SMOR/src/man1/fst-infl.1
+132
-0
No files found.
src/main/resources/SMOR/src/fst.o
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/generate.C
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE generate.C */
/* MODULE generate */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE prints automata */
/* */
/*******************************************************************/
#include "fst.h"
using
std
::
vector
;
namespace
SFST
{
const
int
BUFFER_SIZE
=
10000
;
/*******************************************************************/
/* */
/* Transducer::generate1 */
/* */
/*******************************************************************/
// Recursive worker for generate(): writes every string reachable from
// "node" to "file" and returns true if at least one string was printed.
//   buffer/pos   - output buffer and current write position
//   buffer2/pos2 - second buffer for the upper side; NULL if both sides
//                  are written into "buffer" as labels
//   visitations  - per-node visit counters; less-visited targets are
//                  expanded first
bool Transducer::generate1( Node *node, Node2Int &visitations, char *buffer,
                            int pos, char *buffer2, int pos2, FILE *file )
{
  // give up on this path once either buffer is nearly exhausted
  if (pos > BUFFER_SIZE - 100 || pos2 > BUFFER_SIZE - 100)
    return false;

  bool printed = false;

  if (node->is_final()) {
    // terminate the collected string(s) and print them
    buffer[pos] = '\n';
    buffer[pos+1] = 0;
    if (buffer2) {
      buffer2[pos2] = '\t';
      buffer2[pos2+1] = 0;
      fputs( buffer2, file );
    }
    fputs( buffer, file );
    printed = true;
  }

  visitations[node]++;

  // order the outgoing arcs by the visit count of their target node
  // (ascending; arcs with equal counts keep their original order)
  vector<Arc*> ordered;
  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *candidate = it;
    Node *target = candidate->target_node();
    size_t slot = 0;
    while (slot < ordered.size() &&
           !(visitations[target] < visitations[ordered[slot]->target_node()]))
      slot++;
    ordered.insert( ordered.begin() + slot, candidate );
  }

  for( size_t k=0; k<ordered.size(); k++ ) {
    int end = pos;
    int end2 = pos2;
    Label l = ordered[k]->label();

    if (buffer2 == NULL)
      alphabet.write_label( l, buffer, &end );
    else {
      // separate mode: lower symbol into buffer, upper symbol into buffer2;
      // epsilon is not written
      Character lc = l.lower_char();
      Character uc = l.upper_char();
      if (lc != Label::epsilon)
        alphabet.write_char( lc, buffer, &end );
      if (uc != Label::epsilon)
        alphabet.write_char( uc, buffer2, &end2 );
    }

    printed |= generate1( ordered[k]->target_node(), visitations,
                          buffer, end, buffer2, end2, file );
  }

  return printed;
}
/*******************************************************************/
/* */
/* Transducer::generate */
/* */
/*******************************************************************/
// Prints the strings of the transducer to "file" by calling generate1()
// on the root node. If "separate" is true, a second buffer is supplied so
// that the two sides of each path are collected separately.
// Returns the result of generate1().
bool Transducer::generate( FILE *file, bool separate )
{
  char buffer[BUFFER_SIZE];
  Node2Int visitations;

  if (!separate)
    return generate1( root_node(), visitations, buffer, 0, NULL, 0, file );

  char buffer2[BUFFER_SIZE];
  return generate1( root_node(), visitations, buffer, 0, buffer2, 0, file );
}
}
src/main/resources/SMOR/src/generate.o
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/hopcroft.C
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE hopcroft.C */
/* MODULE hopcroft */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/*******************************************************************/
#include "fst.h"
// #define TEST1
// HFST
namespace
SFST
{
/***************** class Transition ******************************/
class Transition {
  // A transition as used by the minimiser: source state ID, label,
  // target state ID.

 public:
  size_t source;
  Label label;
  size_t target;

  Transition( size_t s, Label l, size_t t )
    : source(s), label(l), target(t) {}

  // Two transitions are equal if all three components are equal.
  bool operator==( const Transition &t ) const {
    return t.source == source && t.label == label && t.target == target;
  }

  // Ordering by target first, then label, then source, so that sorting
  // groups the transitions by the state they lead to.
  bool operator<( const Transition &t ) const {
    if (target != t.target)
      return target < t.target;
    if (label < t.label)
      return true;
    if (label > t.label)
      return false;
    return source < t.source;
  }
};
/***************** class TransitionTable *************************/
// TransitionTable quickly retrieves the transitions to a given state
class TransitionTable {
  // Transducer representation used by the minimiser: all transitions in
  // one vector sorted by target state, plus an index into that vector.

 public:
  NodeNumbering nodenumbering; // maps node pointers to IDs and vice versa
  vector<Transition> transition;
  vector<size_t> startpos;     // startpos[s] .. startpos[s+1] is the range
                               // of transitions whose target is state s

  TransitionTable( Transducer &t ) : nodenumbering(t) {
    t.build_transtab( transition, nodenumbering );
    std::sort( transition.begin(), transition.end() );

    // build the startpos array: every state up to the current target
    // that has no entry yet starts at the current position
    startpos.push_back(0);
    for( size_t i=0; i<transition.size(); i++ )
      while (startpos.size() <= transition[i].target)
        startpos.push_back(i);

    // Terminating entry. Keep padding until every state s of the
    // numbering has both startpos[s] and startpos[s+1], so that states
    // without incoming transitions at the end of the numbering can be
    // looked up safely (they get an empty range).
    const size_t num_states = nodenumbering.number_of_nodes();
    do {
      startpos.push_back( transition.size() );
    } while (startpos.size() <= num_states);
  }
};
/************* class PosRange **********************************/
class PosRange {
  // Half-open position range [from, to): position of the first and
  // last+1 element of a state set.

 public:
  size_t from, to;

  PosRange( size_t f, size_t t ) : from(f), to(t) {}

  // Number of positions in the range. Const so that it can be called on
  // const ranges and references.
  size_t size() const { return to - from; }
};
/************* class Stack *************************************/
class Stack {
  // Stores the IDs of the state sets on the to-do list.
  // An ID is held at most once: pushing an ID that is already stored
  // is a no-op.

 private:
  vector<size_t> setID; // stored IDs in push order, for LIFO retrieval
  set<size_t> store;    // the same IDs, for quick membership lookup

 public:
  // Adds "id" unless it is already on the stack.
  void push( size_t id ) {
    // insert() reports whether the element was new (single lookup)
    if (store.insert(id).second)
      setID.push_back(id);
  }

  bool is_empty() const { return setID.empty(); }

  // Removes and returns the most recently pushed ID.
  // The stack must not be empty.
  size_t pop( void ) {
    size_t result = setID.back();
    setID.pop_back();
    store.erase(result);
    return result;
  }

  bool contains( size_t id ) const {
    return store.find(id) != store.end();
  }
};
/************* class Minimiser *********************************/
// table which maps labels to sets of source states
typedef
map
<
Label
,
set
<
size_t
>
>
Label2SState
;
class
Minimiser
{
// main class of the minimisation algorithm
Transducer
&
transducer
;
// reference to the original transducer
TransitionTable
transtab
;
// internal transducer representation
vector
<
size_t
>
partition
;
// state partition table
// states from the same set occur next to each other in the table
vector
<
PosRange
>
posrange
;
// start+end positions of sets in "partition"
vector
<
size_t
>
state2set
;
// maps states to state sets
Stack
stack
;
// stack of split states (to-do list)
void
print_partition
(
FILE
*
file
)
{
for
(
size_t
i
=
0
;
i
<
posrange
.
size
();
i
++
)
{
fprintf
(
file
,
"set[%lu] ="
,
(
unsigned
long
)
i
);
for
(
size_t
k
=
posrange
[
i
].
from
;
k
<
posrange
[
i
].
to
;
k
++
)
fprintf
(
file
,
" %lu"
,
(
unsigned
long
)
partition
[
k
]);
fputc
(
'\n'
,
file
);
}
}
public:
Minimiser
(
Transducer
&
t
);
// initialises the minimiser
Transducer
&
result
();
// does the minimisation and returns the result
void
compute_source_states
(
Label2SState
&
,
PosRange
);
// computes for each
// label l the set of source states with an "l" transition to a state
// in the current split state set
void
split
(
size_t
B
,
vector
<
size_t
>
&
T1
,
size_t
C
);
// splits a state set
// by the state set given in "T1".
};
/*******************************************************************/
/* */
/* Minimiser::Minimiser */
/* */
/*******************************************************************/
// Builds the internal transition table and the initial partition:
// the final states form the first set (if there are any), the non-final
// states the next one (if there are any).
Minimiser::Minimiser( Transducer &t )
  : transducer(t), transtab(t)
{
  const size_t num_states = transtab.nodenumbering.number_of_nodes();

  // separate the final from the non-final states
  vector<size_t> finals, nonfinals;
  for( size_t state=0; state<num_states; state++ ) {
    if (transtab.nodenumbering.get_node(state)->is_final())
      finals.push_back(state);
    else
      nonfinals.push_back(state);
  }

  state2set.resize( num_states );
  size_t set_id = 0;

  if (!finals.empty()) {
    for( size_t i=0; i<finals.size(); i++ ) {
      partition.push_back( finals[i] );
      state2set[finals[i]] = set_id;
    }
    posrange.push_back( PosRange(0, partition.size()) );
    set_id++;
  }

  if (!nonfinals.empty()) {
    const size_t start = partition.size();
    for( size_t i=0; i<nonfinals.size(); i++ ) {
      partition.push_back( nonfinals[i] );
      state2set[nonfinals[i]] = set_id;
    }
    posrange.push_back( PosRange(start, partition.size()) );
  }
}
/*******************************************************************/
/* */
/* Minimiser::compute_source_states */
/* */
/* compute the backward transitions from states in the split set */
/* */
/*******************************************************************/
// For every state in the partition range "r" and every transition into
// that state, adds the transition's source state to l2ss[label].
void Minimiser::compute_source_states( Label2SState &l2ss, PosRange r )
{
  for( size_t pos=r.from; pos<r.to; pos++ ) {
    const size_t state = partition[pos];

    // range of transitions whose target is "state"
    const size_t first = transtab.startpos[state];
    const size_t last  = transtab.startpos[state+1];

    for( size_t k=first; k<last; k++ ) {
      const Transition &incoming = transtab.transition[k];
      // most expensive code line follows here !!!
      l2ss[incoming.label].insert( incoming.source );
    }
  }
}
/*******************************************************************/
/* */
/* Minimiser::split */
/* */
/*******************************************************************/
// Splits state set B into its intersection with T1 (keeps the ID B) and
// the remainder (gets a new set ID), and schedules the resulting sets on
// the to-do stack. C is the set that is currently used for splitting.
// T1 must be sorted in ascending order; the walk below also relies on the
// states of B being stored in ascending order in "partition".
// Nothing happens if B has a single element or if either part is empty.
void Minimiser::split( size_t B, vector<size_t> &T1, size_t C )
{
  // get the position range of the states in the "partition" table
  PosRange r = posrange[B];
  if (r.size() == 1)
    return;

  vector<size_t> B1_set, B2_set;

#ifdef TEST
  // size_t is cast explicitly because it need not be unsigned long
  fprintf( stderr, "\nsplit %lu with", (unsigned long)B );
  for( size_t i=0; i<T1.size(); i++ )
    fprintf( stderr, " %lu", (unsigned long)T1[i] );
  fputc( '\n', stderr );
#endif

  // compute the intersection (B1_set) and the difference (B2_set) of the
  // two sets with a parallel walk over both sorted sequences
  size_t k = 0;
  size_t i = r.from;
  while (i < r.to) {
    if (k == T1.size() || partition[i] < T1[k])
      // state is not in the intersection
      B2_set.push_back( partition[i++] );
    else if (partition[i] == T1[k]) {
      // state is in the intersection
      B1_set.push_back( partition[i++] );
      k++;
    }
    else
      k++;
  }

#ifdef TEST
  fprintf( stderr, "B1 = " );
  for( size_t i=0; i<B1_set.size(); i++ )
    fprintf( stderr, " %lu", (unsigned long)B1_set[i] );
  fputc( '\n', stderr );
  fprintf( stderr, "B2 = " );
  for( size_t i=0; i<B2_set.size(); i++ )
    fprintf( stderr, " %lu", (unsigned long)B2_set[i] );
  fputc( '\n', stderr );
#endif

  // No real split: either all states are in T1 or none is. Returning
  // here for an empty intersection keeps set B from becoming empty.
  if (B1_set.empty() || B2_set.empty())
    return;

  // split the set in "partition"
  // copy the first subset to partition
  k = r.from;
  for( size_t i=0; i<B1_set.size(); i++ )
    partition[k++] = B1_set[i];

  // update the end position of the reduced state set
  posrange[B].to = k;

  // copy the second subset to partition
  size_t new_B = posrange.size();
  posrange.push_back( PosRange(k, k + B2_set.size()) );
  for( size_t i=0; i<B2_set.size(); i++ ) {
    state2set[B2_set[i]] = new_B;
    partition[k++] = B2_set[i];
  }

  // add the new set to the set of split states
  if (B == C) {
    // the splitter itself was split: both parts have to be processed
    stack.push(B);
    stack.push(new_B);
  }
  else if (stack.contains(B))
    stack.push(new_B);
  else if (B1_set.size() > B2_set.size())
    stack.push(new_B);   // new_B holds the smaller part
  else
    stack.push(B);

#ifdef TEST
  fprintf( stderr, "\nnew partition:\n" );
  print_partition( stderr );
#endif
}
/*******************************************************************/
/* */
/* Minimiser::result */
/* */
/*******************************************************************/
// Runs the partition refinement and returns the minimised transducer
// (a new object). A transducer with a single node is returned as a copy.
Transducer &Minimiser::result()
{
  if (transtab.nodenumbering.number_of_nodes() == 1)
    return transducer.copy(); // no need for a minimisation

  // initialise the stack of split sets
  if (posrange.size() != 2)
    stack.push(0);
  else if (posrange[0].size() > posrange[1].size()) {
    // there are both, final and non-final states
    // put the larger one first on the stack
    stack.push(0);
    stack.push(1);
  }
  else {
    stack.push(1);
    stack.push(0);
  }

#ifdef TEST
  fprintf( stderr, "\nPartition:\n" );
  print_partition( stderr );
#endif

  // repeat until the stack is empty
  while (!stack.is_empty()) {
    // next state set on which the others are split
    const size_t C = stack.pop();

    // compute for each label l the set of source states with a transition
    // to one of the states in the split set which is labelled with l
    Label2SState l2ss;
    compute_source_states( l2ss, posrange[C] );

    // loop over the labels
    for( Label2SState::iterator lit=l2ss.begin(); lit!=l2ss.end(); lit++ ) {
      set<size_t> &sources = lit->second;

      // copy the set of source states to a sorted vector
      vector<size_t> T1( sources.begin(), sources.end() );

      // count for each state set how many of its states are in T1
      map<size_t,size_t> source_set_counts;
      for( size_t i=0; i<T1.size(); i++ )
        source_set_counts[state2set[T1[i]]]++;

      // loop over the source state sets
      for( map<size_t,size_t>::iterator cit=source_set_counts.begin();
           cit!=source_set_counts.end(); cit++ )
      {
        const size_t B = cit->first;
        // only sets which are not completely contained in T1 are split
        if (posrange[B].size() > cit->second)
          split( B, T1, C );
      }
    }
  }

#ifdef TEST
  fprintf( stderr, "\nPartition:\n" );
  print_partition( stderr );
#endif

  return *new Transducer( transducer, state2set, transtab.nodenumbering,
                          posrange.size() );
}
/*******************************************************************/
/* */
/* Transducer::build_TT */
/* */
/*******************************************************************/
// Recursively appends to "transtab" one Transition per arc reachable from
// "node", using the node IDs from "nn". Nodes for which was_visited(vmark)
// returns true are skipped.
void Transducer::build_TT( Node *node, vector<Transition> &transtab,
                           NodeNumbering &nn )
{
  if (node->was_visited( vmark ))
    return;

  const size_t sourceID = nn[node];
  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *arc = it;
    Node *target = arc->target_node();

    // process the target first, then record this arc
    build_TT( target, transtab, nn );
    const size_t targetID = nn[target];
    transtab.push_back( Transition(sourceID, arc->label(), targetID) );
  }
}
/*******************************************************************/
/* */
/* Transducer::build_transtab */
/* */
/*******************************************************************/
// Fills "tt" with the transitions of this transducer, starting the
// traversal at the root node. The visit mark is incremented first.
void Transducer::build_transtab( vector<Transition> &tt, NodeNumbering &nn )
{
  incr_vmark();
  Node *start = root_node();
  build_TT( start, tt, nn );
}
/*******************************************************************/
/* */
/* Transducer::Transducer */
/* */
/* builds the result transducer after minimisation */
/* */
/*******************************************************************/
// Builds the result transducer after minimisation.
//   t                - the original transducer (its alphabet is copied)
//   state2set        - maps each old state ID to its state set
//   nn               - node numbering of the original transducer
//   num_of_new_nodes - number of state sets, i.e. nodes of the result
Transducer::Transducer( Transducer &t, vector<size_t> &state2set,
                        NodeNumbering &nn, size_t num_of_new_nodes )
{
  vmark = 0;
  deterministic = minimised = true;
  alphabet.copy( t.alphabet );

  // Choose a representative for each state set
  // (the last state of the set in ID order wins)
  vector<Node*> representative( num_of_new_nodes );
  for( size_t state=0; state<state2set.size(); state++ )
    representative[state2set[state]] = nn.get_node(state);

  // Create the new nodes; the set containing state 0 becomes the root
  vector<Node*> node( num_of_new_nodes, NULL );
  node[state2set[0]] = root_node();
  for( size_t set_id=0; set_id<node.size(); set_id++ )
    if (node[set_id] == NULL)
      node[set_id] = new_node();

  // Add the transitions
  for( size_t i=0; i<node.size(); i++ ) {
    Node *old_node = representative[i];
    Node *merged = node[i];

    merged->set_final( old_node->is_final() );
    for( ArcsIter p(old_node->arcs()); p; p++ ) {
      Arc *arc = p;

      // Compute the target node: Map the node to its index,
      // the index to its set, and the set to the new node
      Node *target = node[state2set[nn[arc->target_node()]]];

      // Insert the transition
      merged->add_arc( arc->label(), target, this );

#ifdef TEST
      fprintf(stderr,"arc: %lu -%s-> %lu\n", i, alphabet.write_label(arc->label()), state2set[nn[arc->target_node()]]);
#endif
    }
  }
}
/*******************************************************************/
/* */
/* Transducer::minimise */
/* */
/*******************************************************************/
// Returns a minimised version of this transducer as a new object.
// If the transducer is already marked as minimised, a copy is returned.
// Otherwise the transducer is reversed twice, determinised and passed to
// the Minimiser; the intermediate transducers are deleted.
// The "verbose" argument is not used here.
Transducer &Transducer::hopcroft_minimise( bool verbose )
{
  if (minimised)
    return copy();

  Transducer *reversed = &reverse();
  Transducer *restored = &reversed->reverse();
  delete reversed;

  Transducer *determinised = &restored->determinise();
  delete restored;

  Transducer *result = &Minimiser( *determinised ).result();
  delete determinised;

  result->minimised = true;
  result->minimise_alphabet();

  return *result;
}
}
src/main/resources/SMOR/src/hopcroft.o
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/interface.C
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE interface.C */
/* MODULE interface */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/*******************************************************************/
#include <fstream>
#include <set>
#include "interface.h"
using
std
::
ifstream
;
using
std
::
vector
;
namespace
SFST
{
/*******************************************************************/
/* */
/* error */
/* */
/*******************************************************************/
// Prints the error message to cerr and terminates the program with
// exit status 1.
void error( const char *message )
{
  cerr << "\nError: ";
  cerr << message;
  cerr << "\naborted.\n";
  exit(1);
}
/*******************************************************************/
/* */
/* error2 */
/* */
/*******************************************************************/
// Prints "message: input" as an error to cerr and terminates the program
// with exit status 1.
void error2( const char *message, char *input )
{
  cerr << "\nError: ";
  cerr << message << ": " << input;
  cerr << "\naborted.\n";
  exit(1);
}
/*******************************************************************/
/* */
/* Interface::symbol_code */
/* */
/*******************************************************************/
// Returns the code of "symbol" in TheAlphabet; the symbol is added to the
// alphabet if symbol2code() reports EOF for it.
// The string is released with free() before returning.
Character Interface::symbol_code( char *symbol )
{
  const int known = TheAlphabet.symbol2code( symbol );
  const int code = (known == EOF) ? TheAlphabet.add_symbol( symbol ) : known;

  free( symbol );
  return (Character)code;
}
/*******************************************************************/
/* */
/* Interface::character_code */
/* */
/*******************************************************************/
// Returns the symbol code of the character "uc".
// In UTF-8 mode the character is converted with int2utf8(); otherwise it
// is stored as a single byte. The temporary string is handed over to
// symbol_code(), which frees it.
Character Interface::character_code( unsigned int uc )
{
  if (!TheAlphabet.utf8) {
    // one-byte string holding the low byte of uc
    unsigned char *bytes = (unsigned char*)malloc(2);
    bytes[0] = (unsigned char)uc;
    bytes[1] = 0;
    return symbol_code( (char*)bytes );
  }

  return symbol_code( fst_strdup(int2utf8(uc)) );
}
/*******************************************************************/
/* */
/* Interface::add_value */
/* */
/*******************************************************************/
// Prepends a newly allocated element holding "c" to the list "r" and
// returns the new head of the list.
Range *Interface::add_value( Character c, Range *r )
{
  Range *head = new Range;
  head->next = r;
  head->character = c;
  return head;
}
/*******************************************************************/
/* */
/* Interface::add_values */
/* */
/*******************************************************************/
// Prepends the characters c1..c2 (inclusive) to the list "r" and returns
// the new head. The resulting list holds c1 first and c2 last, followed
// by the old elements of "r". If c1 > c2, "r" is returned unchanged.
Range *Interface::add_values( unsigned int c1, unsigned int c2, Range *r )
{
  if (c1 > c2)
    return r;

  // Count down from c2 to c1. The loop is left explicitly after c1 was
  // processed: a condition "c >= c1" on an unsigned counter would never
  // become false for c1 == 0.
  for( unsigned int c=c2; ; c-- ) {
    r = add_value( character_code(c), r );
    if (c == c1)
      break;
  }
  return r;
}
/*******************************************************************/
/* */
/* Interface::append_values */
/* */
/*******************************************************************/
// Returns a list consisting of copies of the elements of "r2" (in the
// same order) followed by the list "r". "r2" itself is neither modified
// nor freed.
Range *Interface::append_values( Range *r2, Range *r )
{
  // collect the characters of r2 ...
  vector<Character> chars;
  for( Range *p=r2; p!=NULL; p=p->next )
    chars.push_back( p->character );

  // ... and prepend them back to front, so that the first element of r2
  // ends up at the head of the result
  Range *result = r;
  for( size_t k=chars.size(); k>0; k-- )
    result = add_value( chars[k-1], result );
  return result;
}
/*******************************************************************/
/* */
/* Interface::add_var_values */
/* */
/*******************************************************************/
// Returns the values of the range variable "name" followed by the list
// "r". "name" is consumed by svar_value().
Range *Interface::add_var_values( char *name, Range *r )
{
  // svar_value() returns a freshly allocated copy of the variable's
  // value. It is linked directly in front of "r" instead of being copied
  // a second time, so that no list is left behind unreleased.
  Range *values = svar_value( name );
  if (values == NULL)
    return r;

  Range *tail = values;
  while (tail->next != NULL)
    tail = tail->next;
  tail->next = r;

  return values;
}
/*******************************************************************/
/* */
/* Interface::in_range */
/* */
/*******************************************************************/
// Returns true if the list "r" contains an element whose character is
// equal to "c".
bool Interface::in_range( unsigned int c, Range *r )
{
  for( ; r; r=r->next )
    if (r->character == c)
      return true;
  return false;
}
/*******************************************************************/
/* */
/* free_values */
/* */
/*******************************************************************/
// Deletes all elements of the list "r" (last element first).
// A NULL list is accepted.
static void free_values( Range *r )
{
  if (r == NULL)
    return;
  free_values( r->next );
  delete r;
}
/*******************************************************************/
/* */
/* free_values */
/* */
/*******************************************************************/
// Deletes all elements of the list of ranges "r" (last element first).
// A NULL list is accepted.
static void free_values( Ranges *r )
{
  if (r == NULL)
    return;
  free_values( r->next );
  delete r;
}
/*******************************************************************/
/* */
/* free_contexts */
/* */
/*******************************************************************/
// Deletes all elements of the context list "c" (last element first).
// A NULL list is accepted.
static void free_contexts( Contexts *c )
{
  if (c == NULL)
    return;
  free_contexts( c->next );
  delete c;
}
/*******************************************************************/
/* */
/* Interface::copy_values */
/* */
/*******************************************************************/
// Returns a newly allocated copy of the list "r" with the elements in the
// same order; NULL for an empty list.
Range *Interface::copy_values( const Range *r )
{
  // collect the characters ...
  vector<Character> chars;
  for( const Range *p=r; p!=NULL; p=p->next )
    chars.push_back( p->character );

  // ... and build the copy back to front
  Range *result = NULL;
  for( size_t k=chars.size(); k>0; k-- )
    result = add_value( chars[k-1], result );
  return result;
}
/*******************************************************************/
/* */
/* Interface::complement_range */
/* */
/*******************************************************************/
// Replaces the range "r" by its complement as computed by
// TheAlphabet.complement(). The argument list is freed and a new list is
// returned. An empty complement is reported with error().
Range *Interface::complement_range( Range *r )
{
  // collect the symbols of the range and release the list
  vector<Character> sym;
  for( Range *p=r; p; p=p->next )
    sym.push_back( p->character );
  free_values( r );

  TheAlphabet.complement( sym );
  if (sym.empty())
    error("Empty character range!");

  // build the result list; each symbol is put in front of the previous
  // ones
  Range *result = NULL;
  for( size_t i=0; i<sym.size(); i++ )
    result = add_value( sym[i], result );

  return result;
}
/*******************************************************************/
/* */
/* Interface::make_transducer */
/* */
/*******************************************************************/
// Builds a transducer with arcs from the root node to a single final
// node. A NULL range stands for the wildcard '.': in that case an arc is
// added for every label of TheAlphabet whose lower/upper character is
// compatible with r1/r2. Otherwise the two ranges are paired element by
// element; when one list is shorter, its last element is reused.
// The ranges are not freed here.
Transducer *Interface::make_transducer( Range *r1, Range *r2 )
{
  Transducer *t = new Transducer();
  Node *node = t->new_node();
  node->set_final(1);

  const bool wildcard = (r1 == NULL || r2 == NULL);

  if (!wildcard) {
    bool more;
    do {
      Label l( r1->character, r2->character );
      // TheAlphabet.insert(l);
      t->root_node()->add_arc( l, node, t );

      // continue as long as at least one list has further elements
      more = (r1->next || r2->next);
      if (r1->next)
        r1 = r1->next;
      if (r2->next)
        r2 = r2->next;
    } while (more);
    return t;
  }

  if (!Alphabet_Defined)
    error("The wildcard symbol '.' requires the definition of an alphabet");

  // one of the ranges was '.'
  for( Alphabet::const_iterator it=TheAlphabet.begin();
       it!=TheAlphabet.end(); it++ )
  {
    const bool lower_ok = (r1 == NULL || in_range(it->lower_char(), r1));
    if (lower_ok && (r2 == NULL || in_range(it->upper_char(), r2)))
      t->root_node()->add_arc( *it, node, t );
  }

  return t;
}
/*******************************************************************/
/* */
/* Interface::empty_transducer */
/* */
/*******************************************************************/
// Returns a new transducer whose root node is final.
Transducer *Interface::empty_transducer()
{
  Transducer *t = new Transducer();
  Node *root = t->root_node();
  root->set_final(1);
  return t;
}
/*******************************************************************/
/* */
/* Interface::one_label_transducer */
/* */
/*******************************************************************/
// Returns a new transducer with a single arc labelled "l" from the root
// node to a final node.
Transducer *Interface::one_label_transducer( Label l )
{
  Transducer *t = new Transducer();
  Node *final_node = t->new_node();
  Node *root = t->root_node();

  root->add_arc( l, final_node, t );
  final_node->set_final(1);

  return t;
}
/*******************************************************************/
/* */
/* Interface::new_transducer */
/* */
/*******************************************************************/
// Builds a transducer from the two ranges with make_transducer() and
// frees the ranges afterwards. If both arguments are the same list, it is
// freed only once.
Transducer *Interface::new_transducer( Range *r1, Range *r2 )
{
  Transducer *result = make_transducer( r1, r2 );

  const bool distinct = (r1 != r2);
  if (distinct)
    free_values( r1 );
  free_values( r2 );

  return result;
}
/*******************************************************************/
/* */
/* Interface::read_words */
/* */
/*******************************************************************/
// Reads a word list from the file "filename" and returns it as a new
// transducer. The symbols of the new transducer are inserted into
// TheAlphabet. "filename" is released with free() after the file was
// opened successfully.
// Throws a char* error message if the file cannot be opened (the file
// name is not freed in that case).
Transducer *Interface::read_words( char *filename )
{
  if (Verbose)
    fprintf( stderr, "\nreading words from %s...", filename );

  ifstream is( filename );
  if (!is.is_open()) {
    static char message[1000];
    // bounded write: a long file name must not overflow the buffer
    snprintf( message, sizeof(message),
              "Error: Cannot open file \"%s\"!", filename );
    throw message;
  }
  free( filename );

  Transducer *t = new Transducer( is, &TheAlphabet, Verbose );
  is.close();
  TheAlphabet.insert_symbols( t->alphabet );

  if (Verbose)
    fprintf( stderr, "finished\n" );

  return t;
}
/*******************************************************************/
/* */
/* Interface::read_transducer */
/* */
/*******************************************************************/
// Reads a transducer from the binary file "filename" and returns a copy
// of it which is built with TheAlphabet. The symbols of the copy are
// inserted into TheAlphabet. "filename" is released with free() once the
// file was read and checked.
// Throws a char* error message if the file cannot be opened or if its
// utf8 flag differs from that of TheAlphabet (the file name is not freed
// in these cases).
Transducer *Interface::read_transducer( char *filename )
{
  if (Verbose)
    fprintf( stderr, "\nreading transducer from %s...", filename );

  FILE *file = fopen( filename, "rb" );
  if (file == NULL) {
    static char message[1000];
    // bounded write: a long file name must not overflow the buffer
    snprintf( message, sizeof(message),
              "Error: Cannot open file \"%s\"!", filename );
    throw message;
  }
  Transducer t( file );
  fclose( file );

  if (t.alphabet.utf8 != TheAlphabet.utf8) {
    static char message[1000];
    snprintf( message, sizeof(message),
              "Error: incompatible character encoding in file \"%s\"!",
              filename );
    throw message;
  }
  free( filename );

  Transducer *nt = &t.copy( false, &TheAlphabet );
  TheAlphabet.insert_symbols( nt->alphabet );

  if (Verbose)
    fprintf( stderr, "finished\n" );

  return nt;
}
/*******************************************************************/
/* */
/* Interface::def_alphabet */
/* */
/*******************************************************************/
// Defines TheAlphabet from the transducer "t": the transducer is exploded
// and minimised, its alphabet is rebuilt with complete_alphabet() and then
// copied into TheAlphabet. The transducer is deleted afterwards.
void Interface::def_alphabet( Transducer *t )
{
  Transducer *a = minimise( explode(t) );

  a->alphabet.clear_char_pairs();
  a->complete_alphabet();

  TheAlphabet.clear_char_pairs();
  TheAlphabet.copy( a->alphabet );
  Alphabet_Defined = 1;

  delete a;
}
/*******************************************************************/
/* */
/* Interface::def_svar definition of a value range variable */
/* */
/*******************************************************************/
bool
Interface
::
def_svar
(
char
*
name
,
Range
*
r
)
{
// delete the old value of the variable
SVarMap
::
iterator
it
=
SVM
.
find
(
name
);
if
(
it
!=
SVM
.
end
())
{
char
*
n
=
it
->
first
;
Range
*
v
=
it
->
second
;
SVM
.
erase
(
it
);
delete
v
;
free
(
n
);
}
SVM
[
name
]
=
r
;
return
r
==
NULL
;
}
/*******************************************************************/
/* */
/* Interface::svar_value */
/* */
/*******************************************************************/
// Returns a newly allocated copy of the value of the range variable
// "name". An undefined variable is reported with error2().
// "name" is released with free().
Range *Interface::svar_value( char *name )
{
  SVarMap::iterator entry = SVM.find( name );
  const bool defined = (entry != SVM.end());
  if (!defined)
    error2( "undefined variable", name );

  free( name );
  const Range *stored = entry->second;
  return copy_values( stored );
}
/*******************************************************************/
/* */
/* Interface::rsvar_value */
/* */
/*******************************************************************/
// Registers "name" in RSS (a copy of the string is stored if the name is
// not contained yet) and returns a one-element range holding the symbol
// code of the name. "name" is consumed by symbol_code().
Range *Interface::rsvar_value( char *name )
{
  const bool registered = (RSS.find(name) != RSS.end());
  if (!registered)
    RSS.insert( fst_strdup(name) );

  const Character code = symbol_code( name );
  return add_value( code, NULL );
}
/*******************************************************************/
/* */
/* Interface::def_var definition of a transducer variable */
/* */
/*******************************************************************/
bool
Interface
::
def_var
(
char
*
name
,
Transducer
*
t
)
{
// delete the old value of the variable
VarMap
::
iterator
it
=
VM
.
find
(
name
);
if
(
it
!=
VM
.
end
())
{
char
*
n
=
it
->
first
;
Transducer
*
v
=
it
->
second
;
VM
.
erase
(
it
);
delete
v
;
free
(
n
);
}
t
=
explode
(
t
);
t
=
minimise
(
t
);
VM
[
name
]
=
t
;
return
t
->
is_empty
();
}
/*******************************************************************/
/* */
/* Interface::def_rvar */
/* definition of an agreement variable for automata */
/* */
/*******************************************************************/
// Definition of an agreement variable for automata. A cyclic transducer
// is rejected with error2(); otherwise the variable is defined with
// def_var().
bool Interface::def_rvar( char *name, Transducer *t )
{
  const bool cyclic = t->is_cyclic();
  if (cyclic)
    error2( "cyclic transducer assigned to", name );

  return def_var( name, t );
}
/*******************************************************************/
/* */
/* Interface::var_value */
/* */
/*******************************************************************/
// Returns a copy of the transducer stored for the variable "name".
// An undefined variable is reported with error2().
// "name" is released with free().
Transducer *Interface::var_value( char *name )
{
  VarMap::iterator entry = VM.find( name );
  const bool defined = (entry != VM.end());
  if (!defined)
    error2( "undefined variable", name );

  free( name );
  Transducer *stored = entry->second;
  return &(stored->copy());
}
/*******************************************************************/
/* */
/* Interface::rvar_value */
/* */
/*******************************************************************/
// Registers "name" in RS (a copy of the string is stored if the name is
// not contained yet) and returns a transducer built from a one-element
// range which holds the symbol code of the name on both sides.
// "name" is consumed by symbol_code().
Transducer *Interface::rvar_value( char *name )
{
  const bool registered = (RS.find(name) != RS.end());
  if (!registered)
    RS.insert( fst_strdup(name) );

  const Character code = symbol_code( name );
  Range *r = add_value( code, NULL );
  return new_transducer( r, r );
}
/*******************************************************************/
/* */
/* Interface::explode */
/* */
/*******************************************************************/
// Replaces the agreement variables registered in RS (transducer
// variables) and RSS (range variables) in the transducer "t".
// For each variable, one version of "t" is built per value of the
// variable and the versions are combined with disjunction().
// "t" is returned unchanged if no variable is registered; otherwise it is
// deleted and a new transducer is returned. RS and RSS are empty
// afterwards; the stored names are consumed by var_value()/svar_value().
Transducer *Interface::explode( Transducer *t )
{
  if (RS.size() == 0 && RSS.size() == 0)
    return t;

  t = minimise( t );

  vector<char*> name;
  for( RVarSet::iterator it=RS.begin(); it!=RS.end(); it++ )
    name.push_back( *it );
  RS.clear();

  // replace all agreement variables
  for( size_t i=0; i<name.size(); i++ ) {
    Transducer *nt = NULL;
    Label l( (Character)TheAlphabet.symbol2code(name[i]) );
    Transducer *vt = var_value( name[i] );

    // enumerate all paths of the transducer
    vector<Transducer*> paths;
    vt->enumerate_paths( paths );
    delete vt;

    // insert each path
    for( size_t k=0; k<paths.size(); k++ ) {
      // insertion
      Transducer *t1 = &t->splice( l, paths[k] );
      delete paths[k];

      if (nt == NULL)
        nt = t1;
      else
        nt = disjunction( nt, t1 );
    }

    // A variable without any path leaves no alternative. Continue with a
    // freshly constructed transducer instead of a NULL pointer, which
    // would be dereferenced below or by the caller.
    // NOTE(review): assumes a default-constructed Transducer accepts
    // nothing - confirm.
    if (nt == NULL)
      nt = new Transducer();

    delete t;
    t = nt;
  }

  name.clear();
  for( RVarSet::iterator it=RSS.begin(); it!=RSS.end(); it++ )
    name.push_back( *it );
  RSS.clear();

  // replace all agreement variables
  for( size_t i=0; i<name.size(); i++ ) {
    Transducer *nt = NULL;
    Character c = (Character)TheAlphabet.symbol2code( name[i] );
    Range *r = svar_value( name[i] );

    // insert each character; the list elements are released on the way
    while (r != NULL) {
      // insertion
      Transducer *t1 = &t->replace_char( c, r->character );

      if (nt == NULL)
        nt = t1;
      else
        nt = disjunction( nt, t1 );

      Range *next = r->next;
      delete r;
      r = next;
    }

    // same safeguard as above for a variable with an empty value range
    if (nt == NULL)
      nt = new Transducer();

    delete t;
    t = nt;
  }

  return t;
}
/*******************************************************************/
/* */
/* Interface::catenate */
/* */
/*******************************************************************/
// Returns the concatenation t1 + t2 and deletes both operands.
Transducer *Interface::catenate( Transducer *t1, Transducer *t2 )
{
  Transducer &joined = *t1 + *t2;
  delete t1;
  delete t2;
  return &joined;
}
/*******************************************************************/
/* */
/* Interface::add_range */
/* */
/*******************************************************************/
// Prepends the range `r` to the list `l` and returns the new list head.
Ranges *Interface::add_range( Range *r, Ranges *l )
{
  Ranges *head = new Ranges;
  head->next = l;
  head->range = r;
  return head;
}
/*******************************************************************/
/* */
/* Interface::make_mapping */
/* */
/*******************************************************************/
// Builds a linear transducer that maps the sequence of ranges `list1`
// (lower side) onto the sequence `list2` (upper side), position by position.
// At positions present in both lists every lower character is paired with
// every upper character; surplus positions of the longer list are paired
// with epsilon.  Both lists are handed to free_values() before returning.
Transducer *Interface::make_mapping( Ranges *list1, Ranges *list2 )
{
  Ranges *lower = list1;
  Ranges *upper = list2;
  Transducer *mapping = new Transducer();
  Node *current = mapping->root_node();

  // positions available on both sides
  for( ; lower && upper; lower = lower->next, upper = upper->next ) {
    Node *target = mapping->new_node();
    for( Range *lo = lower->range; lo; lo = lo->next )
      for( Range *up = upper->range; up; up = up->next )
        current->add_arc( Label(lo->character, up->character), target, mapping );
    current = target;
  }

  // remaining lower positions map to epsilon
  for( ; lower; lower = lower->next ) {
    Node *target = mapping->new_node();
    for( Range *lo = lower->range; lo; lo = lo->next )
      current->add_arc( Label(lo->character, Label::epsilon), target, mapping );
    current = target;
  }

  // remaining upper positions are produced from epsilon
  for( ; upper; upper = upper->next ) {
    Node *target = mapping->new_node();
    for( Range *up = upper->range; up; up = up->next )
      current->add_arc( Label(Label::epsilon, up->character), target, mapping );
    current = target;
  }

  current->set_final(1);

  free_values(list1);
  free_values(list2);
  return mapping;
}
/*******************************************************************/
/* */
/* Interface::disjunction */
/* */
/*******************************************************************/
// Returns the union t1 | t2 and deletes both operands.
Transducer *Interface::disjunction( Transducer *t1, Transducer *t2 )
{
  Transducer &either = *t1 | *t2;
  delete t1;
  delete t2;
  return &either;
}
/*******************************************************************/
/* */
/* Interface::conjunction */
/* */
/*******************************************************************/
// Returns the intersection t1 & t2 and deletes both operands.
// A warning is printed when agreement variables are still pending.
Transducer *Interface::conjunction( Transducer *t1, Transducer *t2 )
{
  const bool agreement_pending = !(RS.size() == 0 && RSS.size() == 0);
  if (agreement_pending)
    cerr << "\nWarning: agreement operation inside of conjunction!\n";

  Transducer &both = *t1 & *t2;
  delete t1;
  delete t2;
  return &both;
}
/*******************************************************************/
/* */
/* Interface::subtraction */
/* */
/*******************************************************************/
// Returns the difference t1 / t2 and deletes both operands.
// A warning is printed when agreement variables are still pending; the
// message now names the subtraction (it used to say "conjunction", copied
// from the neighbouring function).
Transducer *Interface::subtraction( Transducer *t1, Transducer *t2 )
{
  if (RS.size() > 0 || RSS.size() > 0)
    cerr << "\nWarning: agreement operation inside of subtraction!\n";

  Transducer *t = &(*t1 / *t2);
  delete t1;
  delete t2;
  return t;
}
/*******************************************************************/
/* */
/* Interface::composition */
/* */
/*******************************************************************/
// Returns the composition t1 || t2 and deletes both operands.
// A warning is printed when agreement variables are still pending.
Transducer *Interface::composition( Transducer *t1, Transducer *t2 )
{
  const bool agreement_pending = !(RS.size() == 0 && RSS.size() == 0);
  if (agreement_pending)
    cerr << "\nWarning: agreement operation inside of composition!\n";

  Transducer &composed = *t1 || *t2;
  delete t1;
  delete t2;
  return &composed;
}
/*******************************************************************/
/* */
/* Interface::freely_insert */
/* */
/*******************************************************************/
// Returns a transducer in which the symbol pair lc:uc is freely inserted
// into `t`.  Like the other operator wrappers of this class, the argument
// transducer is consumed: Transducer::freely_insert() yields a separate
// object, so `t` is deleted here instead of being leaked.
Transducer *Interface::freely_insert( Transducer *t, Character lc, Character uc )
{
  Transducer *nt = &t->freely_insert(Label(lc,uc));
  delete t;
  return nt;
}
/*******************************************************************/
/* */
/* Interface::negation */
/* */
/*******************************************************************/
// Returns the complement of `t` and deletes `t`.
// The complement is computed after the character pairs of t's alphabet have
// been cleared and TheAlphabet has been copied into it.  A warning is
// printed for pending agreement variables; an undefined alphabet is
// reported through error().
Transducer *Interface::negation( Transducer *t )
{
  const bool agreement_pending = !(RS.size() == 0 && RSS.size() == 0);
  if (agreement_pending)
    cerr << "\nWarning: agreement operation inside of negation!\n";

  if (Alphabet_Defined == false)
    error("Negation requires the definition of an alphabet");

  t->alphabet.clear_char_pairs();
  t->alphabet.copy(TheAlphabet);

  Transducer &complement = !*t;
  delete t;
  return &complement;
}
/*******************************************************************/
/* */
/* Interface::upper_level */
/* */
/*******************************************************************/
// Returns the result of t->upper_level() and deletes `t`.
Transducer *Interface::upper_level( Transducer *t )
{
  Transducer &projected = t->upper_level();
  delete t;
  return &projected;
}
/*******************************************************************/
/* */
/* Interface::lower_level */
/* */
/*******************************************************************/
// Returns the result of t->lower_level() and deletes `t`.
Transducer *Interface::lower_level( Transducer *t )
{
  Transducer &projected = t->lower_level();
  delete t;
  return &projected;
}
/*******************************************************************/
/* */
/* Interface::minimise */
/* */
/*******************************************************************/
// Copies TheAlphabet into t's alphabet, minimises `t` (passing the Verbose
// flag on) and returns the minimised transducer; `t` is deleted.
Transducer *Interface::minimise( Transducer *t )
{
  t->alphabet.copy(TheAlphabet);

  Transducer &minimal = t->minimise(Verbose);
  delete t;
  return &minimal;
}
/*******************************************************************/
/* */
/* Interface::switch_levels */
/* */
/*******************************************************************/
// Returns the result of t->switch_levels() and deletes `t`.
Transducer *Interface::switch_levels( Transducer *t )
{
  Transducer &swapped = t->switch_levels();
  delete t;
  return &swapped;
}
/*******************************************************************/
/* */
/* Interface::repetition */
/* */
/*******************************************************************/
// Returns the Kleene star of `t` and deletes `t`.
Transducer *Interface::repetition( Transducer *t )
{
  Transducer &starred = t->kleene_star();
  delete t;
  return &starred;
}
/*******************************************************************/
/* */
/* Interface::repetition2 */
/* */
/*******************************************************************/
// Returns `t` followed by its Kleene star (one or more repetitions) and
// deletes `t`.
Transducer *Interface::repetition2( Transducer *t )
{
  Transducer *star = &(t->kleene_star());
  Transducer &one_or_more = *t + *star;
  delete t;
  delete star;
  return &one_or_more;
}
/*******************************************************************/
/* */
/* Interface::optional */
/* */
/*******************************************************************/
// Returns a copy of `t` whose root node is marked final; `t` is deleted.
Transducer *Interface::optional( Transducer *t )
{
  Transducer &duplicate = t->copy();
  delete t;

  duplicate.root_node()->set_final(1);
  return &duplicate;
}
/*******************************************************************/
/* */
/* Interface::add_pi_transitions */
/* */
/*******************************************************************/
// Adds, for every label of `alph`, a self-loop on `node` inside the
// transducer `t`.
void Interface::add_pi_transitions( Transducer *t, Node *node, Alphabet &alph )
{
  Alphabet::const_iterator label = alph.begin();
  while (label != alph.end()) {
    node->add_arc( *label, node, t );
    ++label;
  }
}
/*******************************************************************/
/* */
/* Interface::pi_machine */
/* */
/*******************************************************************/
// Creates a new transducer whose root node is final and carries a self-loop
// for every label of `alph` (see add_pi_transitions()).
Transducer *Interface::pi_machine( Alphabet &alph )
{
  Transducer *machine = new Transducer();
  Node *root = machine->root_node();
  root->set_final(1);
  add_pi_transitions( machine, root, alph );
  return machine;
}
/*******************************************************************/
/* */
/* Interface::cp */
/* */
/*******************************************************************/
// Builds the transducer for the given lower/upper ranges by delegating to
// make_transducer().
Transducer *Interface::cp( Range *lower_range, Range *upper_range )
{
  Transducer *pairs = make_transducer(lower_range, upper_range);
  return pairs;
}
/*******************************************************************/
/* */
/* Interface::anti_cp */
/* */
/*******************************************************************/
// Builds a one-step transducer that accepts every label of TheAlphabet whose
// lower character lies in `lower_range` but for which cp(lower_range,
// upper_range) has no transition from its root.  The default-constructed
// Label() is treated the same way when Label::epsilon is in `lower_range`.
Transducer *Interface::anti_cp( Range *lower_range, Range *upper_range )
{
  Transducer *allowed = cp(lower_range, upper_range);
  Transducer *rejected = new Transducer();
  Node *accept = rejected->new_node();
  accept->set_final(1);

  Alphabet::const_iterator pos = TheAlphabet.begin();
  for( ; pos != TheAlphabet.end(); pos++ ) {
    Label l = *pos;
    if (!in_range(l.lower_char(), lower_range))
      continue;
    if (allowed->root_node()->target_node(l))
      continue;
    rejected->root_node()->add_arc( l, accept, rejected );
  }

  const bool epsilon_missing =
    in_range(Label::epsilon, lower_range) &&
    !allowed->root_node()->target_node(Label());
  if (epsilon_missing)
    rejected->root_node()->add_arc( Label(), accept, rejected );

  delete allowed;
  return rejected;
}
/*******************************************************************/
/* */
/* Interface::twol_right_rule */
/* */
/*******************************************************************/
// Builds the transducer for a two-level rule of type twol_right.
// Two "unwanted" languages are formed -- !lc cp pi  and  pi cp !rc -- and
// the complement of their union is returned.  `lc` and `rc` are not deleted
// here, but TheAlphabet is copied into their alphabets before negation.
Transducer *Interface::twol_right_rule( Transducer *lc, Range *lower_range,
                                        Range *upper_range, Transducer *rc )
{
  // Build the rule transducer
  Transducer *center = cp(lower_range, upper_range);
  Transducer *any = pi_machine(TheAlphabet);

  // First unwanted language: wrong left context
  lc->alphabet.copy(TheAlphabet);
  Transducer *not_left = &(!*lc);
  Transducer *left_part = &(*not_left + *center);
  delete not_left;
  Transducer *bad_left = &(*left_part + *any);
  delete left_part;

  // Second unwanted language: wrong right context
  rc->alphabet.copy(TheAlphabet);
  Transducer *not_right = &(!*rc);
  Transducer *right_part = &(*center + *not_right);
  delete center;
  delete not_right;
  Transducer *bad_right = &(*any + *right_part);
  delete any;
  delete right_part;

  // Union of both (the helper deletes its operands), then the complement
  Transducer *unwanted = disjunction(bad_left, bad_right);
  unwanted->alphabet.copy(TheAlphabet);
  Transducer *rule = &(!*unwanted);
  delete unwanted;

  return rule;
}
/*******************************************************************/
/* */
/* Interface::twol_left_rule */
/* */
/*******************************************************************/
// Builds the transducer for a two-level rule of type twol_left: the
// complement of  lc anti_cp rc.  `lc` and `rc` are not deleted here.
// When the lower range contains epsilon (an insertion such as
// "$L <> <= a $R"), contexts that generate the empty string are reported as
// errors and a warning is printed.
Transducer *Interface::twol_left_rule( Transducer *lc, Range *lower_range,
                                       Range *upper_range, Transducer *rc )
{
  const bool insertion = in_range(Label::epsilon, lower_range);
  if (insertion) {
    if (lc->generates_empty_string())
      error("in two level rule: insertion operation with deletable left context!");
    if (rc->generates_empty_string())
      error("in two level rule: insertion operation with deletable right context!");
    cerr << "\nWarning: two level rule used for insertion operation (might produce unexpected results)\n";
  }

  // Build the rule transducer
  Transducer *wrong_center = anti_cp(lower_range, upper_range);

  // Add the left context
  Transducer *with_left = &(*lc + *wrong_center);
  delete wrong_center;

  // Add the right context
  Transducer *violation = &(*with_left + *rc);
  delete with_left;

  // Form the complement
  violation->alphabet.copy(TheAlphabet);
  Transducer *rule = &(!*violation);
  delete violation;

  return rule;
}
/*******************************************************************/
/* */
/* Interface::make_rule */
/* */
/*******************************************************************/
// Builds and minimises a two-level rule transducer.
// A missing (NULL) context is replaced by the pi machine over TheAlphabet;
// a given left context is prefixed and a given right context is suffixed
// with it.  `type` selects the left rule, the right rule, or the
// intersection of both.  Both contexts are deleted and the ranges are
// handed to free_values() (only once when both pointers are equal).
Transducer *Interface::make_rule( Transducer *lc, Range *lower_range,
                                  Twol_Type type, Range *upper_range,
                                  Transducer *rc )
{
  if (!(RS.size() == 0 && RSS.size() == 0))
    cerr << "\nWarning: agreement operation inside of replacement rule!\n";

  if (Alphabet_Defined == false)
    error("Two level rules require the definition of an alphabet");

  // expand the left and the right contexts to their full length
  Transducer *any = pi_machine(TheAlphabet);

  if (lc != NULL) {
    Transducer *padded = &(*any + *lc);
    delete lc;
    lc = padded;
  }
  else
    lc = pi_machine(TheAlphabet);

  if (rc != NULL) {
    Transducer *padded = &(*rc + *any);
    delete rc;
    rc = padded;
  }
  else
    rc = pi_machine(TheAlphabet);

  delete any;

  Transducer *rule = NULL;
  if (type == twol_left)
    rule = twol_left_rule(lc, lower_range, upper_range, rc);
  else if (type == twol_right)
    rule = twol_right_rule(lc, lower_range, upper_range, rc);
  else if (type == twol_both) {
    Transducer *left_rule = twol_left_rule(lc, lower_range, upper_range, rc);
    Transducer *right_rule = twol_right_rule(lc, lower_range, upper_range, rc);
    rule = &(*left_rule & *right_rule);
    delete left_rule;
    delete right_rule;
  }

  delete lc;
  delete rc;

  // Both range arguments may be the same list; free it only once.
  if (lower_range != upper_range)
    free_values(lower_range);
  free_values(upper_range);

  return minimise(rule);
}
/*******************************************************************/
/* */
/* Interface::make_context */
/* */
/*******************************************************************/
// Creates a single-element context list from the left context `l` and the
// right context `r`; a NULL side is replaced by empty_transducer().
Contexts *Interface::make_context( Transducer *l, Transducer *r )
{
  Transducer *left_side = (l != NULL) ? l : empty_transducer();
  Transducer *right_side = (r != NULL) ? r : empty_transducer();

  Contexts *ctx = new Contexts();
  ctx->left = left_side;
  ctx->right = right_side;
  ctx->next = NULL;
  return ctx;
}
/*******************************************************************/
/* */
/* Interface::add_context */
/* */
/*******************************************************************/
// Links the context `nc` in front of the list `c` and returns `nc`.
Contexts *Interface::add_context( Contexts *nc, Contexts *c )
{
  Contexts *head = nc;
  head->next = c;
  return head;
}
/*******************************************************************/
/* */
/* Interface::restriction_transducer */
/* */
/*******************************************************************/
// Computes  !( (l1 / l2) with `marker` replaced by epsilon ).
// TheAlphabet is copied into l1's alphabet before the subtraction and into
// the intermediate result before the complement.  Neither `l1` nor `l2` is
// deleted here.
Transducer *Interface::restriction_transducer( Transducer *l1, Transducer *l2,
                                               Character marker )
{
  l1->alphabet.copy(TheAlphabet);

  Transducer *difference = &(*l1 / *l2);
  Transducer *unmarked = &difference->replace_char(marker, Label::epsilon);
  delete difference;

  unmarked->alphabet.copy(TheAlphabet);
  Transducer *complement = &(!*unmarked);
  delete unmarked;

  return complement;
}
/*******************************************************************/
/* */
/* Interface::marker_transducer */
/* */
/*******************************************************************/
// Allocates a new marker symbol from TheAlphabet (returned through
// `marker`) and builds the one-label transducer for it.  The symbols of
// `t` and of every left/right context in `c` are inserted into the
// alphabet of the returned transducer.
Transducer *Interface::marker_transducer( Transducer *t, Contexts *c,
                                          Character &marker )
{
  marker = TheAlphabet.new_marker();
  Transducer *marked = one_label_transducer( Label(marker) );

  // build the alphabet with a new marker
  marked->alphabet.insert_symbols( t->alphabet );
  for( Contexts *ctx = c; ctx; ctx = ctx->next ) {
    marked->alphabet.insert_symbols( ctx->left->alphabet );
    marked->alphabet.insert_symbols( ctx->right->alphabet );
  }

  return marked;
}
/*******************************************************************/
/* */
/* Interface::center_transducer */
/* */
/*******************************************************************/
// Returns the concatenation  pi + mt + t + mt + pi.  None of the three
// arguments is deleted; only the intermediate results are.
Transducer *Interface::center_transducer( Transducer *t, Transducer *pi,
                                          Transducer *mt )
{
  Transducer *prefix = &(*pi + *mt);

  Transducer *with_center = &(*prefix + *t);
  delete prefix;

  Transducer *with_marker = &(*with_center + *mt);
  delete with_center;

  Transducer *complete = &(*with_marker + *pi);
  delete with_marker;

  return complete;
}
/*******************************************************************/
/* */
/* Interface::context_transducer */
/* */
/*******************************************************************/
// Builds the union over all contexts in `c` of
//     pi + left[i] + mt + t + mt + right[i] + pi
// and returns it (NULL when `c` is empty).  The arguments are not deleted.
//
// The union accumulated so far is now released when it is replaced by the
// next, larger union; previously every intermediate union was leaked.
Transducer *Interface::context_transducer( Transducer *t, Transducer *pi,
                                           Transducer *mt, Contexts *c )
{
  // mt + t + mt is shared by all contexts
  Transducer *t1 = &(*mt + *t);
  Transducer *tmp = &(*t1 + *mt);
  delete t1;

  Transducer *result = NULL;
  while (c) {
    t1 = &(*pi + *c->left);
    Transducer *t2 = &(*t1 + *tmp);
    delete t1;
    t1 = &(*t2 + *c->right);
    delete t2;
    t2 = &(*t1 + *pi);
    delete t1;

    if (result) {
      t1 = &(*result | *t2);
      delete t2;
      delete result;   // the old union is superseded by t1
      result = t1;
    }
    else
      result = t2;

    c = c->next;
  }
  delete tmp;

  return result;
}
/*******************************************************************/
/* */
/* Interface::result_transducer */
/* */
/*******************************************************************/
// Combines the center language `l1` and the context language `l2` according
// to `type`: restriction_transducer(l1,l2) for twol_right, with swapped
// operands for twol_left, and the intersection of both for twol_both.
// Returns NULL for any other value.  `l1` and `l2` are not deleted.
Transducer *Interface::result_transducer( Transducer *l1, Transducer *l2,
                                          Twol_Type type, Character marker )
{
  switch (type) {
  case twol_right:
    return restriction_transducer( l1, l2, marker );

  case twol_left:
    return restriction_transducer( l2, l1, marker );

  case twol_both: {
    Transducer *forward = restriction_transducer( l1, l2, marker );
    Transducer *backward = restriction_transducer( l2, l1, marker );
    Transducer *both = &(*forward & *backward);
    delete forward;
    delete backward;
    return both;
  }
  }

  return NULL;
}
/*******************************************************************/
/* */
/* Interface::restriction */
/* */
/*******************************************************************/
// Builds the restriction of `t` to the contexts `c`.
// `direction` selects the center used inside the context language:
//   0     -> the pi machine itself,
//   1     -> lower_level(t) || pi,
//   other -> pi || upper_level(t).
// `t` is deleted and `c` is passed to free_contexts().
Transducer *Interface::restriction( Transducer *t, Twol_Type type,
                                    Contexts *c, int direction )
{
  Character marker;
  Transducer *mt = marker_transducer( t, c, marker );
  Transducer *pi = pi_machine( TheAlphabet );
  Transducer *l1 = center_transducer( t, pi, mt );

  Transducer *center;
  switch (direction) {
  case 0:
    center = pi;
    break;

  case 1: {
    // compute _t || .*
    Transducer *lower = &t->lower_level();
    center = &(*lower || *pi);
    delete lower;
    break;
  }

  default: {
    // compute ^t || .*
    Transducer *upper = &t->upper_level();
    center = &(*pi || *upper);
    delete upper;
    break;
  }
  }
  delete t;

  Transducer *l2 = context_transducer( center, pi, mt, c );

  // `center` aliases `pi` for direction 0; delete the object only once.
  if (center != pi)
    delete center;
  delete pi;
  delete mt;

  Transducer *restricted = result_transducer( l1, l2, type, marker );
  delete l1;
  delete l2;

  free_contexts( c );

  return restricted;
}
/*******************************************************************/
/* */
/* Interface::constrain_boundary_transducer */
/* */
/*******************************************************************/
// Builds  !( .* <L><R> .* )  where <L> = leftm and <R> = rightm: starting
// from the pi machine, the root is made non-final, a two-arc path labelled
// leftm, rightm leads to a new final node with pi self-loops, and the
// result is complemented with respect to TheAlphabet.
Transducer *Interface::constrain_boundary_transducer( Character leftm,
                                                      Character rightm )
{
  // create the transducer (.|<L>|<R>)*
  Transducer *pattern = pi_machine(TheAlphabet);

  // create the transducer (.|<L>|<R>)* <L><R> (.|<L>|<R>)*
  Node *start = pattern->root_node();
  Node *middle = pattern->new_node();
  Node *accept = pattern->new_node();

  start->set_final(0);
  accept->set_final(1);

  start->add_arc( Label(leftm), middle, pattern );
  middle->add_arc( Label(rightm), accept, pattern );
  add_pi_transitions( pattern, accept, TheAlphabet );

  // create the transducer !((.|<L>|<R>)* <L><R> (.|<L>|<R>)*)
  pattern->alphabet.copy(TheAlphabet);
  Transducer &complement = !*pattern;
  delete pattern;

  return &complement;
}
/*******************************************************************/
/* */
/* Interface::extended_left_transducer */
/* */
/*******************************************************************/
// Builds the extended left context   .* (<R> >> (<L> >> $T$)) || !(.*<L>)
// with <L> = m1 and <R> = m2.  A NULL context yields the pi machine.
// `t` is deleted.
Transducer *Interface::extended_left_transducer( Transducer *t, Character m1,
                                                 Character m2 )
{
  // empty context
  if (t == NULL)
    return pi_machine(TheAlphabet);

  // <R> >> (<L> >> $T$)
  Transducer *with_m1 = &t->freely_insert( Label(m1) );
  delete t;
  Transducer *context = &with_m1->freely_insert( Label(m2) );
  delete with_m1;

  // .* (<R> >> (<L> >> $T$))
  add_pi_transitions( context, context->root_node(), TheAlphabet );

  // !(.*<L>)
  Transducer *ends_in_m1 = one_label_transducer( Label(m1) );
  add_pi_transitions( ends_in_m1, ends_in_m1->root_node(), TheAlphabet );
  ends_in_m1->alphabet.copy(TheAlphabet);
  Transducer *not_ending_in_m1 = &(!*ends_in_m1);
  delete ends_in_m1;

  // .* (<R> >> (<L> >> $T$)) || !(.*<L>)
  Transducer *extended = &(*context || *not_ending_in_m1);
  delete context;
  delete not_ending_in_m1;

  return extended;
}
/*******************************************************************/
/* */
/* Interface::left_context */
/* */
/*******************************************************************/
// Builds the left-context constraint for the replace operators:
//     !( (!ct mt) | (ct !mt) )
// where ct is the extended left context of `t` (see
// extended_left_transducer(), which also consumes `t`) and mt is
// <R>* <L> .*  with <L> = m1 and <R> = m2.
//
// A leftover debugging block that made four copy() calls on the
// intermediate transducers and never used or released the copies has been
// removed; it only leaked memory.
Transducer *Interface::left_context( Transducer *t, Character m1, Character m2 )
{
  // .* (<R> >> (<L> >> $T$)) || !(.*<L>)
  Transducer *ct = extended_left_transducer(t, m1, m2);

  // <R>* <L> .*
  Transducer *mt = one_label_transducer(Label(m1));
  mt->root_node()->add_arc(Label(m2), mt->root_node(), mt);
  add_pi_transitions(mt, mt->root_node()->target_node(Label(m1)), TheAlphabet);

  // Complements are taken with respect to TheAlphabet.
  ct->alphabet.copy(TheAlphabet);
  Transducer *no_ct = &!*ct;

  mt->alphabet.copy(TheAlphabet);
  Transducer *no_mt = &!*mt;

  // !ct mt
  Transducer *t1 = &(*no_ct + *mt);
  delete no_ct;
  delete mt;

  // ct !mt
  Transducer *t2 = &(*ct + *no_mt);
  delete ct;
  delete no_mt;

  Transducer *tmp = &(*t1 | *t2);
  delete t1;
  delete t2;

  tmp->alphabet.copy(TheAlphabet);
  t1 = &!*tmp;
  delete tmp;

  return t1;
}
/*******************************************************************/
/* */
/* Interface::make_optional */
/* */
/*******************************************************************/
// Returns the union of `t` with the pi machine over TheAlphabet; `t` is
// deleted (disjunction() releases both of its operands).
Transducer *Interface::make_optional( Transducer *t )
{
  Transducer *any = pi_machine(TheAlphabet);
  return disjunction(t, any);
}
/*******************************************************************/
/* */
/* Interface::replace */
/* */
/*******************************************************************/
// Builds the unconditional replacement transducer  (no_ct ct)* no_ct  for
// the center `ct`, where  no_ct = !(.* X .*)  and X is lower_level(ct) for
// repl_up or upper_level(ct) for repl_down.  Any other `type` is reported
// through error().  With `optional` set, the result is additionally passed
// through make_optional().  `ct` is deleted.
Transducer *Interface::replace( Transducer *ct, Repl_Type type, bool optional )
{
  // compute the no-center transducer
  Transducer *side = NULL;
  switch (type) {
  case repl_up:
    // _ct
    side = &ct->lower_level();
    break;
  case repl_down:
    // ^ct
    side = &ct->upper_level();
    break;
  default:
    error("Invalid type of replace operator");
  }

  // .* _ct
  add_pi_transitions( side, side->root_node(), TheAlphabet );

  // .* _ct .*   (catenate() deletes both operands)
  Transducer *contains_center = catenate( side, pi_machine(TheAlphabet) );

  // no_ct = !(.* _ct .*)
  contains_center->alphabet.copy(TheAlphabet);
  Transducer *no_ct = &(!*contains_center);
  delete contains_center;

  // compute the unconditional replacement transducer

  // no-ct ct
  Transducer *step = &(*no_ct + *ct);
  delete ct;

  // (no-ct ct)*   (repetition() deletes its operand)
  Transducer *steps = repetition( step );

  // (no-ct ct)* no-ct
  Transducer *replacement = catenate( steps, no_ct );

  if (optional)
    replacement = make_optional( replacement );

  return replacement;
}
/*******************************************************************/
/* */
/* Interface::replace_transducer */
/* */
/*******************************************************************/
// Prepares the center `ct` for conditional replacement: the markers `lm`
// and `rm` are freely inserted into it, it is wrapped as
//     <L> (<L> >> (<R> >> $Center$)) <R>
// and the result is handed to replace() (non-optional).  `ct` is consumed.
Transducer *Interface::replace_transducer( Transducer *ct, Character lm,
                                           Character rm, Repl_Type type )
{
  // insert boundary markers into the center transducer
  // <L> >> (<R> >> $Center$)
  Transducer *with_lm = &ct->freely_insert( Label(lm) );
  delete ct;
  Transducer *with_both = &with_lm->freely_insert( Label(rm) );
  delete with_lm;

  // add surrounding boundary markers to the center transducer
  // <L> (<L> >> (<R> >> $Center$))      (catenate() deletes both operands)
  Transducer *opened = catenate( one_label_transducer( Label(lm) ), with_both );

  // $CenterB$ = <L> (<L> >> (<R> >> $Center$)) <R>
  Transducer *bounded = catenate( opened, one_label_transducer( Label(rm) ) );

  return replace( bounded, type, false );
}
/*******************************************************************/
/* */
/* Interface::replace_in_context */
/* */
/*******************************************************************/
// Builds the conditional replacement transducer for the center `t` in the
// context `c` (only the first element of `c` is read).  The implementation
// of the replace operators is based on "The Replace Operator" by Lauri
// Karttunen.
//
// The result is the composition
//     insert-boundaries || constrain-boundaries || contexts/replace ||
//     remove-boundaries
// where the position of the left and right context transducers relative to
// the replace transducer depends on `type`.  Two temporary marker symbols
// are added to TheAlphabet and removed again before returning.  With
// `optional` set the result is passed through make_optional().
//
// `t`, c->left and c->right are consumed; `c` is passed to free_contexts().
//
// Six copy() calls whose results were immediately overwritten (debugging
// residue), together with a dead store to `tmp`, have been removed: they
// leaked one transducer each.
Transducer *Interface::replace_in_context( Transducer *t, Repl_Type type,
                                           Contexts *c, bool optional )
{
  if (!Alphabet_Defined)
    error("The replace operators require the definition of an alphabet");

  if (!c->left->is_automaton() || !c->right->is_automaton())
    error("The replace operators require automata as context expressions!");

  // create the marker symbols
  Character leftm = TheAlphabet.new_marker();
  Character rightm = TheAlphabet.new_marker();

  /////////////////////////////////////////////////////////////
  // Create the insert boundaries transducer (.|<>:<L>|<>:<R>)*
  /////////////////////////////////////////////////////////////

  Transducer *ibt = pi_machine(TheAlphabet);
  Node *root = ibt->root_node();
  root->add_arc( Label(Label::epsilon, leftm), root, ibt );
  root->add_arc( Label(Label::epsilon, rightm), root, ibt );

  /////////////////////////////////////////////////////////////
  // Create the remove boundaries transducer (.|<L>:<>|<R>:<>)*
  /////////////////////////////////////////////////////////////

  Transducer *rbt = pi_machine(TheAlphabet);
  root = rbt->root_node();
  root->add_arc( Label(leftm, Label::epsilon), root, rbt );
  root->add_arc( Label(rightm, Label::epsilon), root, rbt );

  // Add the markers to the alphabet (after ibt/rbt were built, so their
  // pi loops do not contain the plain marker symbols)
  TheAlphabet.insert(Label(leftm));
  TheAlphabet.insert(Label(rightm));

  /////////////////////////////////////////////////////////////
  // Create the constrain boundaries transducer !(.*<L><R>.*)
  /////////////////////////////////////////////////////////////

  Transducer *cbt = constrain_boundary_transducer(leftm, rightm);

  /////////////////////////////////////////////////////////////
  // Create the extended context transducers
  /////////////////////////////////////////////////////////////

  // left context transducer: .* (<R> >> (<L> >> $T$)) || !(.*<L>)
  Transducer *lct = left_context(c->left, leftm, rightm);

  // right context transducer: (<R> >> (<L> >> $T$)) .* || !(<R>.*)
  // built as the reversal of a left context over the reversed right context
  Transducer *tmp = &c->right->reverse();
  delete c->right;
  Transducer *t2 = left_context(tmp, rightm, leftm);
  Transducer *rct = &t2->reverse();
  delete t2;

  /////////////////////////////////////////////////////////////
  // unconditional replace transducer
  /////////////////////////////////////////////////////////////

  Transducer *rt;
  if (type == repl_up || type == repl_right || type == repl_left)
    rt = replace_transducer( t, leftm, rightm, repl_up );
  else
    rt = replace_transducer( t, leftm, rightm, repl_down );

  /////////////////////////////////////////////////////////////
  // build the conditional replacement transducer
  /////////////////////////////////////////////////////////////

  tmp = &(*ibt || *cbt);
  delete(ibt);
  delete(cbt);

  // contexts that are checked before the replacement
  if (type == repl_up || type == repl_left) {
    t2 = &(*tmp || *lct);
    delete tmp;
    delete lct;
    tmp = t2;
  }
  if (type == repl_up || type == repl_right) {
    t2 = &(*tmp || *rct);
    delete tmp;
    delete rct;
    tmp = t2;
  }

  t2 = &(*tmp || *rt);
  delete tmp;
  delete rt;
  tmp = t2;

  // contexts that are checked after the replacement
  if (type == repl_down || type == repl_right) {
    t2 = &(*tmp || *lct);
    delete tmp;
    delete lct;
    tmp = t2;
  }
  if (type == repl_down || type == repl_left) {
    t2 = &(*tmp || *rct);
    delete tmp;
    delete rct;
    tmp = t2;
  }

  t2 = &(*tmp || *rbt);
  delete tmp;
  delete rbt;

  // Remove the markers from the alphabet
  TheAlphabet.delete_markers();

  if (optional)
    t2 = make_optional(t2);

  free_contexts( c );
  return t2;
}
/*******************************************************************/
/* */
/* Interface::add_alphabet */
/* */
/*******************************************************************/
// Copies TheAlphabet into the alphabet of `t` and then calls
// complete_alphabet() on `t`.
void Interface::add_alphabet( Transducer *t )
{
  Transducer &target = *t;
  target.alphabet.copy(TheAlphabet);
  target.complete_alphabet();
}
/*******************************************************************/
/* */
/* Interface::write_to_file */
/* */
/*******************************************************************/
// Stores the transducer `t` in the binary file `filename`.
// The file is opened first; on failure a message is written to stderr and
// the program exits with status 1.  `filename` is released with free().
// Before storing, `t` goes through explode(), add_alphabet() and
// minimise().
void Interface::write_to_file( Transducer *t, char *filename )
{
  FILE *out = fopen(filename, "wb");
  if (out == NULL) {
    fprintf(stderr, "\nError: Cannot open output file \"%s\"\n\n", filename);
    exit(1);
  }
  free( filename );

  Transducer *exploded = explode(t);
  add_alphabet(exploded);
  Transducer *minimal = minimise(exploded);

  minimal->store(out);
  fclose(out);
}
/*******************************************************************/
/* */
/* Interface::result */
/* */
/*******************************************************************/
// Produces the final transducer: `t` is exploded, all variable definitions
// in VM are discarded (transducers deleted, name strings freed), the levels
// are switched when `switch_flag` is set, and the result is given the
// complete alphabet and minimised.
Transducer *Interface::result( Transducer *t, bool switch_flag )
{
  t = explode(t);

  // delete the variable values; the keys are freed only after the map has
  // been emptied
  vector<char*> keys;
  for( VarMap::iterator entry = VM.begin(); entry != VM.end(); ++entry ) {
    keys.push_back(entry->first);
    delete entry->second;
    entry->second = NULL;
  }
  VM.clear();

  for( vector<char*>::iterator key = keys.begin(); key != keys.end(); ++key )
    free(*key);
  keys.clear();

  if (switch_flag)
    t = switch_levels(t);

  add_alphabet(t);
  return minimise(t);
}
}
src/main/resources/SMOR/src/interface.h
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE interface.h */
/* MODULE interface */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/*******************************************************************/
#ifndef _INTERFACE_H_
#define _INTERFACE_H_
#include "utf8.h"
#include "fst.h"
using
std
::
set
;
using
std
::
cerr
;
namespace
SFST
{
void
error
(
const
char
*
message
);
void
error2
(
const
char
*
message
,
char
*
input
);
typedef
enum
{
twol_left
,
twol_right
,
twol_both
}
Twol_Type
;
typedef
enum
{
repl_left
,
repl_right
,
repl_up
,
repl_down
}
Repl_Type
;
// Singly linked list of characters (a symbol range); the list ends with a
// NULL `next` pointer.
typedef struct range_t {
  Character character;    // character stored in this list element
  struct range_t *next;   // following element, or NULL
} Range;
// Singly linked list of Range lists, i.e. a sequence of symbol ranges as
// consumed by Interface::make_mapping().
typedef struct ranges_t {
  Range  *range;           // range at this position
  struct ranges_t *next;   // following position, or NULL
} Ranges;
// Singly linked list of (left context, right context) transducer pairs,
// built with Interface::make_context() and Interface::add_context().
typedef struct contexts_t {
  Transducer *left, *right;   // left and right context transducers
  struct contexts_t *next;    // following context pair, or NULL
} Contexts;
// Compiler back end: holds the variable tables and the global alphabet and
// offers one member function per operator of the transducer language.
class Interface {

private:

  // Strict weak ordering of C strings by content (strcmp).
  struct ltstr {
    bool operator()(const char* s1, const char* s2) const
    { return strcmp(s1, s2) < 0; }
  };

  // Equality of C strings by content (strcmp).
  struct eqstr {
    bool operator()(const char* s1, const char* s2) const
    { return strcmp(s1, s2) == 0; }
  };

  // Set of variable names, ordered by string content.
  typedef set<char*, ltstr> RVarSet;

  // Variable name -> transducer value.
  typedef hash_map<char*, Transducer*, hash<const char*>, eqstr> VarMap;

  // Variable name -> symbol range value.
  typedef hash_map<char*, Range*, hash<const char*>, eqstr> SVarMap;

  // Internal helpers; several are building blocks of the two-level rule,
  // restriction and replace operators.
  Range *copy_values( const Range *r );
  Transducer *empty_transducer();
  Transducer *one_label_transducer( Label l );
  void add_pi_transitions( Transducer *t, Node *node, Alphabet &alph );
  Transducer *pi_machine( Alphabet &alph );
  Transducer *cp( Range *lower_range, Range *upper_range );
  Transducer *anti_cp( Range *lower_range, Range *upper_range );
  Transducer *twol_right_rule( Transducer *lc, Range *lower_range,
                               Range *upper_range, Transducer *rc );
  Transducer *twol_left_rule( Transducer *lc, Range *lower_range,
                              Range *upper_range, Transducer *rc );
  Transducer *restriction_transducer( Transducer *l1, Transducer *l2,
                                      Character marker );
  Transducer *marker_transducer( Transducer *t, Contexts *c,
                                 Character &marker );
  Transducer *center_transducer( Transducer *t, Transducer *pi,
                                 Transducer *mt );
  Transducer *context_transducer( Transducer *t, Transducer *pi,
                                  Transducer *mt, Contexts *c );
  Transducer *constrain_boundary_transducer( Character leftm, Character rm );
  Transducer *extended_left_transducer( Transducer *t,
                                        Character m1, Character m2 );
  Transducer *left_context( Transducer *t, Character m1, Character m2 );
  Transducer *make_optional( Transducer *t );
  Transducer *replace_transducer( Transducer *ct, Character lm,
                                  Character rm, Repl_Type type );
  Transducer *result_transducer( Transducer *l1, Transducer *l2,
                                 Twol_Type type, Character marker );

  VarMap VM;     // transducer variables (see def_var / var_value)
  SVarMap SVM;   // presumably the symbol-range variables (def_svar / svar_value) -- TODO confirm
  RVarSet RS;    // agreement variables recorded by rvar_value, resolved by explode
  RVarSet RSS;   // second set of pending agreement variables, resolved by explode via svar_value

public:
  bool Verbose;            // passed on to Transducer::minimise()
  bool Alphabet_Defined;   // checked by operators that require an alphabet
  Alphabet TheAlphabet;    // the alphabet shared by all operations

  // Creates an interface with no alphabet defined yet; `utf8` is stored in
  // TheAlphabet.utf8.
  Interface( bool utf8=false, bool verbose=false ) :
    Verbose(verbose), Alphabet_Defined(false)
  {
    TheAlphabet.utf8 = utf8;
  }

  Transducer *new_transducer( Range*, Range* );
  Transducer *read_words( char *filename );
  Transducer *read_transducer( char *filename );
  Transducer *var_value( char *name );
  Transducer *rvar_value( char *name );
  Range *svar_value( char *name );
  Range *complement_range( Range* );
  Range *rsvar_value( char *name );
  Character character_code( unsigned int uc );
  Character symbol_code( char *s );
  bool in_range( unsigned int c, Range *r );
  Transducer *make_transducer( Range *r1, Range *r2 );

  Range *add_value( Character, Range* );
  Range *add_var_values( char *name, Range* );
  Range *add_values( unsigned int, unsigned int, Range* );
  Range *append_values( Range *r2, Range *r );
  void add_alphabet( Transducer* );

  // These functions delete their argument automata

  void def_alphabet( Transducer *a );
  bool def_var( char *name, Transducer *a );
  bool def_rvar( char *name, Transducer *a );
  bool def_svar( char *name, Range *r );
  Transducer *explode( Transducer *a );
  Transducer *catenate( Transducer *a1, Transducer *a2 );
  Transducer *disjunction( Transducer *a1, Transducer *a2 );
  Transducer *conjunction( Transducer *a1, Transducer *a2 );
  Transducer *subtraction( Transducer *a1, Transducer *a2 );
  Transducer *composition( Transducer *a1, Transducer *a2 );
  Transducer *restriction( Transducer *a, Twol_Type type, Contexts *c, int );
  Transducer *replace( Transducer *a, Repl_Type type, bool optional );
  Transducer *replace_in_context( Transducer *a, Repl_Type type,
                                  Contexts *c, bool optional );
  Transducer *negation( Transducer *a );
  Transducer *upper_level( Transducer *a );
  Transducer *lower_level( Transducer *a );
  Transducer *minimise( Transducer *a );
  Transducer *switch_levels( Transducer *a );
  Transducer *repetition( Transducer *a );
  Transducer *repetition2( Transducer *a );
  Transducer *optional( Transducer *a );
  Transducer *make_rule( Transducer *lc, Range *r1, Twol_Type type,
                         Range *r2, Transducer *rc );
  Transducer *freely_insert( Transducer *a, Character lc, Character uc );
  Transducer *make_mapping( Ranges*, Ranges* );
  Ranges *add_range( Range*, Ranges* );
  Contexts *make_context( Transducer *l, Transducer *r );
  Contexts *add_context( Contexts *nc, Contexts *c );
  Transducer *result( Transducer*, bool );
  void write_to_file( Transducer*, char *filename );
};
}
#endif
src/main/resources/SMOR/src/interface.o
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/lowmem.C
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE lowmem.C */
/* MODULE lowmem */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE Code needed for analysing data */
/* */
/*******************************************************************/
#include <stdio.h>
#include "lowmem.h"
using
std
::
vector
;
namespace
SFST
{
const
int
BUFFER_SIZE
=
1000
;
/*******************************************************************/
/* */
/* LowMemTransducer::analyze */
/* */
/*******************************************************************/
void
LowMemTransducer
::
analyze
(
const
LMNode
&
node
,
vector
<
Character
>
&
input
,
size_t
ipos
,
Analysis
&
ca
,
vector
<
Analysis
>
&
analyses
)
{
if
(
node
.
finalp
&&
ipos
==
input
.
size
())
// store the new analysis
analyses
.
push_back
(
ca
);
// follow the transitions
for
(
int
i
=
0
;
i
<
node
.
number_of_arcs
;
i
++
)
{
ca
.
push_back
(
node
.
arc
[
i
].
label
);
LMNode
target
(
node
.
arc
[
i
].
tnodepos
,
lmafile
);
if
(
node
.
arc
[
i
].
label
.
upper_char
()
==
Label
::
epsilon
)
analyze
(
target
,
input
,
ipos
,
ca
,
analyses
);
else
if
(
ipos
<
input
.
size
()
&&
node
.
arc
[
i
].
label
.
upper_char
()
==
(
Character
)
input
[
ipos
])
analyze
(
target
,
input
,
ipos
+
1
,
ca
,
analyses
);
ca
.
pop_back
();
}
}
/*******************************************************************/
/* */
/* LowMemTransducer::analyze_string */
/* */
/*******************************************************************/
void
LowMemTransducer
::
analyze_string
(
char
*
string
,
vector
<
Analysis
>
&
analyses
)
{
vector
<
Character
>
input
;
alphabet
.
string2symseq
(
string
,
input
);
Analysis
ca
;
analyses
.
clear
();
analyze
(
*
rootnode
,
input
,
0
,
ca
,
analyses
);
if
(
simplest_only
)
alphabet
.
disambiguate
(
analyses
);
}
/*******************************************************************/
/* */
/* LowMemTransducer::LowMemTransducer */
/* */
/*******************************************************************/
// Opens a lowmem transducer stored in `file`.
//
// The first byte must be 'l'; otherwise a const char* message is thrown.
// The alphabet is read next, and the file position reached after the
// alphabet is taken as the position of the root node.  `file` is kept in
// lmafile because nodes are read from it during analysis.
LowMemTransducer::LowMemTransducer( FILE *file )
  : simplest_only(false), lmafile(file)
{
  const int format_tag = fgetc(file);
  if (format_tag != 'l')
    throw "Error: wrong file format (not a lowmem transducer)\n";

  alphabet.read(file);

  const long root_pos = ftell(file);
  rootnode = new LMNode(root_pos, lmafile);
}
}
src/main/resources/SMOR/src/lowmem.h
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE lowmem.h */
/* MODULE lowmem */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE finite state tools */
/* */
/*******************************************************************/
#ifndef _LOWMEM_H_
#define _LOWMEM_H_
#include <stdio.h>
#include "alphabet.h"
#include <vector>
namespace
SFST
{
/***************** class LMArc ***********************************/
// A single outgoing transition of an LMNode.
class LMArc {

public:
  // symbol pair attached to the transition
  Label label;

  // file position of the node the transition leads to
  unsigned int tnodepos;

  // both fields are filled in by LMNode after construction
  LMArc() {}
};
/***************** class LMNode **********************************/
class
LMNode
{
public:
bool
finalp
;
unsigned
short
number_of_arcs
;
LMArc
*
arc
;
LMNode
(
long
pos
,
FILE
*
lmafile
)
{
fseek
(
lmafile
,
pos
,
SEEK_SET
);
fread
(
&
finalp
,
sizeof
(
finalp
),
1
,
lmafile
);
fread
(
&
number_of_arcs
,
sizeof
(
number_of_arcs
),
1
,
lmafile
);
arc
=
new
LMArc
[
number_of_arcs
];
for
(
int
i
=
0
;
i
<
(
int
)
number_of_arcs
;
i
++
)
{
Character
lc
,
uc
;
unsigned
int
tpos
;
fread
(
&
lc
,
sizeof
(
lc
),
1
,
lmafile
);
fread
(
&
uc
,
sizeof
(
uc
),
1
,
lmafile
);
fread
(
&
tpos
,
sizeof
(
tpos
),
1
,
lmafile
);
arc
[
i
].
label
=
Label
(
lc
,
uc
);
arc
[
i
].
tnodepos
=
tpos
;
}
};
~
LMNode
()
{
delete
[]
arc
;
};
};
/***************** class LowMemTransducer *************************/
// Transducer whose nodes are read from a file on demand during analysis
// rather than being held in memory as a whole.
class LowMemTransducer {

private:
  // The object owns `rootnode` and deletes it in the destructor; a copy
  // would delete it a second time, so copying is disabled (declared but
  // not defined).
  LowMemTransducer( const LowMemTransducer& );
  LowMemTransducer &operator=( const LowMemTransducer& );

protected:
  // Recursive search step used by analyze_string().
  void analyze( const LMNode&, std::vector<Character> &input, size_t ipos,
                Analysis&, std::vector<Analysis>& );

public:
  // if set, analyze_string() passes its result to alphabet.disambiguate()
  bool simplest_only;

  // file the nodes are read from; the destructor does not close it
  FILE *lmafile;

  // root node, allocated by the constructor
  LMNode *rootnode;

  // alphabet read from the transducer file
  Alphabet alphabet;

  // Reads the header and alphabet from the given file and loads the root
  // node.
  LowMemTransducer( FILE* );

  ~LowMemTransducer() { delete rootnode; }

  // Stores all analyses of `string` in `analyses` (previous content is
  // discarded).
  void analyze_string( char *string, std::vector<Analysis> &analyses );
};
}
#endif
src/main/resources/SMOR/src/lowmem.o
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/make-compact.C
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE make-compact.C */
/* MODULE make-compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE Code needed for generating compact automata */
/* */
/*******************************************************************/
#include <math.h>
#include "make-compact.h"
namespace
SFST
{
// Temporary arc record used by MakeCompactTransducer::sort().
class ARC {
public:
  int cv;                    // sort key (character code, -1 for epsilon)
  Label label;               // label of the arc
  unsigned int target_node;  // number of the target node

  // Orders arcs by their sort key only.  The argument is taken by
  // reference so that std::sort does not copy an ARC per comparison.
  bool operator<( const ARC &a ) const {
    return cv < a.cv;
  }
};
// Hash table from a Label to a number.  store_labels() fills it with the
// position of each label in the alphabet's iteration order.
typedef hash_map<Label, size_t, Label::label_hash, Label::label_eq> LabelNumber;
/*******************************************************************/
/* */
/* MakeCompactTransducer::sort */
/* */
/*******************************************************************/
// Sorts the outgoing arcs of every node by the character found on the
// given level of their labels.  Epsilon arcs get the key -1 so that they
// come first even if the epsilon code is not 0.
void MakeCompactTransducer::sort( Level level )
{
  for( unsigned int nd=0; nd<number_of_nodes; nd++ ) {
    // arcs of node nd occupy the index range [lo, hi)
    const unsigned int lo = first_arc[nd];
    const unsigned int hi = first_arc[nd+1];
    const int cnt = hi - lo;

    // gather the arcs of this node in a scratch table
    ARC *tmp = new ARC[cnt];
    ARC *dst = tmp;
    for( unsigned int a=lo; a<hi; a++, dst++ ) {
      const int code = (int)label[a].get_char(level);
      dst->cv = (code == (int)Label::epsilon) ? -1 : code;
      dst->label = label[a];
      dst->target_node = target_node[a];
    }

    std::sort( tmp, tmp+cnt );

    // write the sorted arcs back into the transducer tables
    const ARC *src = tmp;
    for( unsigned int a=lo; a<hi; a++, src++ ) {
      label[a] = src->label;
      target_node[a] = src->target_node;
    }

    delete[] tmp;
  }
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::count_arcs */
/* */
/*******************************************************************/
// Depth-first traversal which records, for every node not yet marked with
// `vmark`, its final flag in finalp[] and its number of outgoing arcs in
// first_arc[], both indexed by the node number taken from `index`.
void MakeCompactTransducer::count_arcs( Node *node, NodeNumbering &index,
                                        VType vmark )
{
  if (node->was_visited( vmark ))
    return;

  const unsigned int slot = index[node];
  finalp[slot] = node->is_final();

  unsigned int &out_degree = first_arc[slot];
  out_degree = 0;

  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *a = it;
    ++out_degree;
    count_arcs( a->target_node(), index, vmark );
  }
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_arcs */
/* */
/*******************************************************************/
// Depth-first traversal which copies the arcs of every node not yet
// marked with `vmark` into label[] and target_node[], starting at the
// slot given by first_arc[] for that node.
void MakeCompactTransducer::store_arcs( Node *node, NodeNumbering &index,
                                        VType vmark )
{
  if (node->was_visited( vmark ))
    return;

  // next free slot in the arc tables for this node
  unsigned int pos = first_arc[index[node]];

  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *a = it;
    Node *successor = a->target_node();

    label[pos] = a->label();
    target_node[pos] = index[successor];
    pos++;

    store_arcs( successor, index, vmark );
  }
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::MakeCompactTransducer */
/* */
/*******************************************************************/
// Builds the compact table representation of transducer `a` and sorts
// the arcs of each node on level `l`.
//
// Prints an error message and terminates the program with exit(1) if
// `a` is infinitely ambiguous.
MakeCompactTransducer::MakeCompactTransducer( Transducer &a, Level l )
{
  if (a.is_infinitely_ambiguous()) {
    std::cerr << "Error: resulting transducer contains an infinite loop!\n";
    exit(1);
  }

  NodeNumbering index(a);
  alphabet.copy(a.alphabet);

  // node tables; first_arc has one extra slot for the end of the last node
  number_of_nodes = (unsigned)index.number_of_nodes();
  finalp = new char[number_of_nodes];
  first_arc = new unsigned int[number_of_nodes+1];

  // first pass: first_arc[n] receives the number of arcs leaving node n
  a.incr_vmark();
  count_arcs( a.root_node(), index, a.vmark );

  // turn the per-node counts into start offsets (exclusive prefix sum);
  // the extra slot ends up holding the total number of arcs
  unsigned int offset = 0;
  for( unsigned int n=0; n<number_of_nodes; n++ ) {
    const unsigned int degree = first_arc[n];
    first_arc[n] = offset;
    offset += degree;
  }
  first_arc[number_of_nodes] = offset;
  number_of_arcs = first_arc[number_of_nodes];

  // arc tables
  label = new Label[number_of_arcs];
  target_node = new unsigned int[number_of_arcs];

  // second pass: copy the arcs into the tables
  a.incr_vmark();
  store_arcs( a.root_node(), index, a.vmark );

  sort( l );
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_finalp */
/* */
/*******************************************************************/
// Writes the final flags of all nodes to `file` as a bit vector, eight
// nodes per byte with the first node in the most significant bit.  The
// last byte is padded with zero bits on the right.
void MakeCompactTransducer::store_finalp( FILE *file )
{
  unsigned char byte = 0;  // byte under construction
  int used = 0;            // number of bits already placed in `byte`

  for( size_t i=0; i<number_of_nodes; i++ ) {
    byte = (unsigned char)((byte << 1) | (finalp[i] ? 1 : 0));
    used++;
    if (used < 8)
      continue;

    fputc( byte, file );
    byte = 0;
    used = 0;
  }

  // flush an incomplete last byte, left-aligned
  if (used != 0)
    fputc( (unsigned char)(byte << (8-used)), file );
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_first_arcs */
/* */
/* The data is encoded with the minimal number of bits needed. */
/* */
/*******************************************************************/
// Writes the number_of_nodes+1 entries of first_arc[] to `file`, each
// encoded with the same number of bits and packed into unsigned int
// words from the most significant bit downwards.  Words are written
// with fwrite in native byte order.
void MakeCompactTransducer::store_first_arcs( FILE *file )
{
  // compute number of bits required for storing each item
  // (the formula is kept as is: the reader has to derive the same width)
  int bits = (int)ceil(log(number_of_arcs+1)/log(2));

  // Without arcs every entry is 0 and needs 0 bits, so there is nothing
  // to write.  Returning here also avoids shifting a word by its full
  // width below, which is undefined behaviour.
  if (bits <= 0)
    return;

  const int width = (int)sizeof(unsigned int) * 8;
  int k = 0;           // number of bits already used in n
  unsigned int n = 0;  // word under construction

  for( size_t i=0; i<=number_of_nodes; i++ ) {
    const unsigned int v = first_arc[i];

    // left-align the value, then move it behind the bits already used
    n |= (v << (width - bits)) >> k;
    k += bits;

    if (k >= width) {
      fwrite(&n, sizeof(n), 1, file);
      k -= width;
      // the k low bits of v did not fit; they start the next word
      n = (k == 0) ? 0 : (v << (width - k));
    }
  }

  if (k > 0)
    fwrite(&n, sizeof(n), 1, file);
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_target_nodes */
/* */
/*******************************************************************/
// Writes the number_of_arcs entries of target_node[] to `file`, each
// encoded with the same number of bits and packed into unsigned int
// words from the most significant bit downwards.  Words are written
// with fwrite in native byte order.
void MakeCompactTransducer::store_target_nodes( FILE *file )
{
  // nothing to write without arcs
  if (number_of_arcs == 0)
    return;

  // number of bits per item
  // (the formula is kept as is: the reader has to derive the same width)
  int bits = (int)ceil(log(number_of_nodes)/log(2));

  // With a single node every target is 0 and needs 0 bits, so nothing is
  // written.  Returning here also avoids shifting a word by its full
  // width below, which is undefined behaviour.
  if (bits <= 0)
    return;

  const int width = (int)sizeof(unsigned int) * 8;
  int k = 0;           // number of bits already used in n
  unsigned int n = 0;  // word under construction

  for( size_t i=0; i<number_of_arcs; i++ ) {
    const unsigned int v = target_node[i];

    // left-align the value, then move it behind the bits already used
    n |= (v << (width - bits)) >> k;
    k += bits;

    if (k >= width) {
      fwrite(&n, sizeof(n), 1, file);
      k -= width;
      // the k low bits of v did not fit; they start the next word
      n = (k == 0) ? 0 : (v << (width - k));
    }
  }

  if (k > 0)
    fwrite(&n, sizeof(n), 1, file);
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store_labels */
/* */
/*******************************************************************/
// Writes the label of every arc to `file`.  A label is represented by
// its position in the alphabet's iteration order; the positions are
// encoded with the same number of bits each and packed into unsigned int
// words from the most significant bit downwards.  Words are written
// with fwrite in native byte order.
void MakeCompactTransducer::store_labels( FILE *file )
{
  // nothing to write without arcs
  if (number_of_arcs == 0)
    return;

  // number of bits per item
  // (the formula is kept as is: the reader has to derive the same width)
  int bits = (int)ceil(log((double)alphabet.size())/log(2));

  // With a single label every item is 0 and needs 0 bits, so nothing is
  // written.  Returning here also avoids shifting a word by its full
  // width below, which is undefined behaviour.
  if (bits <= 0)
    return;

  // number the labels in the order in which the alphabet enumerates them
  size_t N = 0;
  LabelNumber LNum;
  for( Alphabet::const_iterator it=alphabet.begin();
       it!=alphabet.end(); ++it )
  {
    Label l = *it;
    LNum[l] = N++;
  }

  const int width = (int)sizeof(unsigned int) * 8;
  int k = 0;           // number of bits already used in n
  unsigned int n = 0;  // word under construction

  for( size_t i=0; i<number_of_arcs; i++ ) {
    const unsigned int l = (unsigned)LNum[label[i]];

    // left-align the value, then move it behind the bits already used
    n |= (l << (width - bits)) >> k;
    k += bits;

    if (k >= width) {
      fwrite(&n, sizeof(n), 1, file);
      k -= width;
      // the k low bits of l did not fit; they start the next word
      n = (k == 0) ? 0 : (l << (width - k));
    }
  }

  if (k > 0)
    fwrite(&n, sizeof(n), 1, file);
}
/*******************************************************************/
/* */
/* MakeCompactTransducer::store */
/* */
/*******************************************************************/
// Writes the transducer to `file`: the format tag 'c', the alphabet, the
// node and arc counts, and then the final flags, first-arc table, labels
// and target nodes, in that order.
// Throws a const char* message if the stream's error indicator is set
// afterwards.
void MakeCompactTransducer::store( FILE *file )
{
  // header
  fputc( 'c', file );
  alphabet.store( file );
  fwrite( &number_of_nodes, sizeof(number_of_nodes), 1, file );
  fwrite( &number_of_arcs, sizeof(number_of_arcs), 1, file );

  // tables
  store_finalp( file );
  store_first_arcs( file );
  store_labels( file );
  store_target_nodes( file );

  if (ferror( file ) != 0)
    throw "Error encountered while writing transducer to file\n";
}
}
src/main/resources/SMOR/src/make-compact.h
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE make-compact.h */
/* MODULE make-compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/*******************************************************************/
#ifndef _MAKE_COMPACT_H_
#define _MAKE_COMPACT_H_
#include "fst.h"
#include "compact.h"
namespace
SFST
{
// Converts a Transducer into the table layout inherited from
// CompactTransducer and writes it to a file.
class MakeCompactTransducer : private CompactTransducer {

  // traversal helpers used by the constructor
  void count_arcs( Node *node, NodeNumbering &index, VType vmark );
  void store_arcs( Node *node, NodeNumbering &index, VType vmark );

  // writers for the individual tables, called by store()
  void store_finalp( FILE *file );
  void store_first_arcs( FILE *file );
  void store_target_nodes( FILE *file );
  void store_labels( FILE *file );

public:
  // builds the tables from `a` and sorts the arcs on the given level
  MakeCompactTransducer( Transducer &a, Level sort=upper );

  // sorts the arcs of every node on the given level
  void sort( Level );

  // writes the compact transducer to `file`
  void store( FILE *file );
};
}
#endif
src/main/resources/SMOR/src/make-compact.o
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/man1/fst-compact.1
0 → 100755
View file @
0b611765
.TH fst-compact 1 "October 2004" "" "fst-compact"
.SH NAME
fst-compact \- transforms transducers to the compact format
.SH SYNOPSIS
.B fst-compact
.I [ file [ file ] ]
.SH DESCRIPTION
.I fst-compact
reads a transducer in standard format from the input and writes it in
compact format to the output. The compact format is required by
.I fst-infl2.
.SH BUGS
No bugs are known so far.
.SH "SEE ALSO"
fst-compiler, fst-infl2, fst-infl3
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
src/main/resources/SMOR/src/man1/fst-compare.1
0 → 100755
View file @
0b611765
.TH fst-compare 1 "February 2002" "" "fst-compare"
.SH NAME
fst-compare \- checks whether two transducers are equivalent
.SH SYNOPSIS
.B fst-compare
.I file1 file2
.SH DESCRIPTION
.I fst-compare
checks whether the two transducers stored in
.I file1
and
.I file2
are equivalent.
.SH BUGS
No bugs are known so far.
.SH "SEE ALSO"
fst-compiler
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
src/main/resources/SMOR/src/man1/fst-compiler-utf8.1
0 → 100755
View file @
0b611765
.TH fst-compiler 1 "December 2004" "" "fst-compiler"
.SH NAME
fst-compiler fst-compiler-utf8 \- Two compilers for SFST programs
.SH SYNOPSIS
.B fst-compiler
.I grammar-file
[
.I output-file
]
.br
.B fst-compiler-utf8
.I grammar-file
[
.I output-file
]
.SH OPTIONS
.TP
.B \-c
Store the transducer in compact format which is used by fst-infl2.
.TP
.B \-l
Store the transducer in lowmem format.
.TP
.B \-s
Switch surface and analysis layer of the transducer. You have to use
this switch in order to use
.I fst-infl (fst-infl2, fst-infl3)
for generation rather than analysis.
.SH DESCRIPTION
.B fst-compiler
is a compiler for finite-state transducer programs. It generates a
minimized finite state transducer which can be used with
.I fst-mor,
.I fst-infl,
.I fst-print,
.I fst-compare,
.I fst-parse,
and
.I fst-lattice.
The compact transducer representation which is generated with the -c
flag, is supported by
.I fst-infl2,
.I fst-train,
and
.I fst-match.
The memory-efficient transducer representation which is generated with
the -l flag, is only supported by
.I fst-infl3.
.PP
The first program argument is the name of a file which contains the
transducer program. The programming language is described below. The
second argument is the name of the file to which the resulting
transducer will be written in binary form. If a second argument is
missing, the output will be written to
.I stdout.
.PP
.I fst-compiler-utf8
differs from
.I fst-compiler
only in the character encoding.
.I fst-compiler-utf8
supports UTF8 encoding of the source files whereas
.I fst-compiler
is to be used for 8-Bit character codes like latin1 which are an
extension of the ASCII code. Information about the encoding is stored
in the transducer files and used by the other SFST programs.
.SH "FILE FORMATS"
A transducer program consists of an (optional) sequence of
.I alphabet
and
.I variable
definitions followed by a single
.I transducer expression
which defines the result transducer.
.PP
.SM Alphabet
.PP
An alphabet definition consists of the keyword ALPHABET followed by
= and some transducer expression e.g.
.TP
ALPHABET = [a-z]:[A-Z]
.PP
This command redefines the alphabet as the set of symbol pairs
occurring on the transitions of the transducer. Occurrences of
two-level operators, negation operators and unquoted periods always
have to be preceded by an alphabet definition.
.PP
.SM Variables
.PP
There are two different types of variables.
.I Symbol set variables
are enclosed by hash signs (#) and take symbol sequences (see below)
as values:
.TP 0
#UC# = A-Z
#LC# = a-z
.PP
.I Transducer variables
are enclosed by dollar signs and take transducer expressions as
values:
.TP 0
$MAP$ = [a-z]:[A-Z]+
$MAP$ = [#LC#]:[#UC#]+
.PP
Variables whose name starts with the symbol `=' are special
.I agreement
variables. If an agreement variable occurs more than once in a
transducer expression, it will always have the same value. Consider
the following transducer program:
.TP 0
$=1$ = [abc]
$=1$ X $=1$
.PP
The result transducer recognizes the strings aXa, bXb, and cXc. Only
acyclic transducers (i.e. transducers with a finite set of string
mappings) can be assigned to agreement variables.
.PP
.SM Symbols
.PP
A symbol is either
.PP
- a single character like A s 5,
.PP
- a quoted character like \\* or \\_,
.TP 2
- a multi-character symbol like <X> or <ab.c5> (which is always
enclosed in angle brackets) or
.TP
- a backslash followed by a number which is the numeric code of the
designated character
.PP
- the null symbol <>.
.PP
.SM Symbol sequence
.PP
A symbol sequence is a sequence of characters, multi-character symbols
and character ranges, e.g. a-z \\. <x>.
.PP
.SM symbol range
.PP
A symbol range is either
.PP
- a single symbol
.PP
- a symbol sequence enclosed in square brackets like [A-Za-z] or
.PP
- a symbol sequence starting with ^ and enclosed in square brackets
like [^A-Za-z] (designating the complement of [a-zA-Z]) or
.PP
- the period (which represents any symbol from the alphabet)
.PP
.SM Transducer expressions
.PP
A transducer expression (TE) is recursively defined as follows:
.TP 2
- A pair of two symbol ranges separated by a colon is a TE.
[a-z]:[A-Z]
.TP 1
- A single symbol range like [a-z] is a TE.
.br
It is a short form for [a-z]:[a-z].
.TP 1
- Two symbol sequences enclosed in braces and separated by a colon are
a TE. {a[bc]}:{def} is equivalent to a:d b:e <>:f | a:d c:e <>:f.
.TP 1
- X Y is a TE if X and Y are TEs.
.br
(Blanks are ignored unless they are quoted.)
.TP 1
- (X) is a TE if X is a TE.
.TP 1
- X op is a TE if X is a TE and op is either * (Kleene's star operator), +
(Kleene's plus operator), or ? (optionality operator)
.TP 1
- op X is a TE if X is a TE and op is either ! (negation operator), ^
(target language extraction operator), _ (source language extraction
operator), or ^_ (source and target switch operator).
.TP 1
- X op Y is a TE if X and Y are TEs and op is either & (conjunction
operator), | (disjunction operator), || (composition operator), or -
(subtraction operator)
.TP 1
- L x op y R is a TE if L and R are TEs, x and y are symbol ranges and
op is either => (two-level restriction), <= (two-level coercion), or
<=> (two-level restriction and coercion).
.TP 1
- X op L__R is a TE if X, L and R are TEs and op is either ^-> (upward
replacement), _-> (downward replacement), /-> (leftward replacement)
or \\-> (rightward replacement). Furthermore, L and R must define
automata (i.e. which map their strings onto themselves). These
operators correspond to Karttunen's replace operators. If the arrow is
followed by a question mark (?), the replacement becomes optional.
.TP 1
- X << l is a TE if X is a TE, and l is either of the form
a or the form a:b where a and b are single characters or symbols. The
result is a transducer where l was freely inserted into X. The
transducer ab << c for instance is equivalent to c*ac*bc*.
.TP 1
- X op Y L1__R1, ... , LN__RN is a TE if X, Y, L1 through LN and R1
through RN are TEs, and op is either => (general restriction), <=
(general coercion), ^=> (general surface restriction), ^<= (general
surface coercion), ^<=> (general surface restriction and coercion),
_=> (general deep restriction), _<= (general deep coercion), _<=>
(general deep restriction and coercion). (These operators were
implemented following a suggestion by Anssi Yli-Jyra.)
.TP 1
- "fname" is a TE. The compiler reads the file named fname and turns
it into a transducer of the form line1|line2|line3|... where linex is
the x-th line of the file. All characters other than : and \\ are
interpreted literally (i.e. not as operators). This TE is typically
used e.g. to read a morpheme list from a file.
.TP 1
- "<fname>" is a TE. The compiler reads a pre-compiled transducer from
the file named fname.
.PP
Further Features
.PP
Comments start with the symbol % and extend up to the end of the line.
Blanks are ignored unless they are quoted. Expressions terminate at
the end of a line unless the end of line is preceded by a backslash.
The command
.TP
#include "fname"
.PP
can be used to insert source code from a
file named fname.
The command
.TP
RE >> "fname"
.PP
stores the regular expression RE in the file fname.
The command
.TP
#use hopcroft
.PP
tells the compiler to use the Hopcroft minimisation algorithm from now on, and
.TP
#use default
.PP
switches back to the default minimisation algorithm (Brzozowski).
.SH EXAMPLE
Here is an example of a simple transducer program. Assuming that
the file "adj-stems" contains the three lines
.PP
.ti +3
easy
.ti +3
late
.ti +3
big
.PP
this transducer will correctly analyze the adjective forms easy,
easier, easiest and late, later, and latest.
.PP
ALPHABET = [a-zA-Z] y:i e:<> <ADJ>:<>
$R$ = y<=>i (<ADJ>:<> e)
$R2$ = e<=><> (<ADJ>:<> e)
$R$ = $R$ & $R2$
$Stems$ = "adj-stems"
$S$ = $Stems$ <ADJ> (<pos>:<>|<cmp>:{er}|<sup>:{est})
$S$ || $R$
.SH "EXIT STATUS"
.B fst-compiler
returns 0 unless some error occurs.
.\" .SH FILES
.SH BUGS
The compiler gets the operator precedence wrong in case of two-level
rules and interprets the expression "ab c<=>d ef" as "a(b c<=>d
(ef))". Therefore, you should always surround the left context of
two-level rules with parentheses: (ab) c<=>d (ef)
.SH "SEE ALSO"
fst-mor, fst-infl, fst-infl2, fst-infl3, fst-print, fst-compact,
fst-parse, fst-compare, fst-lowmem, fst-lattice, fst-train
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de,
This software is available under the GNU Public License.
src/main/resources/SMOR/src/man1/fst-compiler.1
0 → 100755
View file @
0b611765
.TH fst-compiler 1 "December 2004" "" "fst-compiler"
.SH NAME
fst-compiler fst-compiler-utf8 \- Two compilers for SFST programs
.SH SYNOPSIS
.B fst-compiler
.I grammar-file
[
.I output-file
]
.br
.B fst-compiler-utf8
.I grammar-file
[
.I output-file
]
.SH OPTIONS
.TP
.B \-c
Store the transducer in compact format which is used by fst-infl2.
.TP
.B \-l
Store the transducer in lowmem format.
.TP
.B \-s
Switch surface and analysis layer of the transducer. You have to use
this switch in order to use
.I fst-infl (fst-infl2, fst-infl3)
for generation rather than analysis.
.SH DESCRIPTION
.B fst-compiler
is a compiler for finite-state transducer programs. It generates a
minimized finite state transducer which can be used with
.I fst-mor,
.I fst-infl,
.I fst-print,
.I fst-compare,
.I fst-parse,
and
.I fst-lattice.
The compact transducer representation which is generated with the -c
flag, is supported by
.I fst-infl2,
.I fst-train,
and
.I fst-match.
The memory-efficient transducer representation which is generated with
the -l flag, is only supported by
.I fst-infl3.
.PP
The first program argument is the name of a file which contains the
transducer program. The programming language is described below. The
second argument is the name of the file to which the resulting
transducer will be written in binary form. If a second argument is
missing, the output will be written to
.I stdout.
.PP
.I fst-compiler-utf8
differs from
.I fst-compiler
only in the character encoding.
.I fst-compiler-utf8
supports UTF8 encoding of the source files whereas
.I fst-compiler
is to be used for 8-Bit character codes like latin1 which are an
extension of the ASCII code. Information about the encoding is stored
in the transducer files and used by the other SFST programs.
.SH "FILE FORMATS"
A transducer program consists of an (optional) sequence of
.I alphabet
and
.I variable
definitions followed by a single
.I transducer expression
which defines the result transducer.
.PP
.SM Alphabet
.PP
An alphabet definition consists of the keyword ALPHABET followed by
= and some transducer expression e.g.
.TP
ALPHABET = [a-z]:[A-Z]
.PP
This command redefines the alphabet as the set of symbol pairs
occurring on the transitions of the transducer. Occurrences of
two-level operators, negation operators and unquoted periods always
have to be preceded by an alphabet definition.
.PP
.SM Variables
.PP
There are two different types of variables.
.I Symbol set variables
are enclosed by hash signs (#) and take symbol sequences (see below)
as values:
.TP 0
#UC# = A-Z
#LC# = a-z
.PP
.I Transducer variables
are enclosed by dollar signs and take transducer expressions as
values:
.TP 0
$MAP$ = [a-z]:[A-Z]+
$MAP$ = [#LC#]:[#UC#]+
.PP
Variables whose name starts with the symbol `=' are special
.I agreement
variables. If an agreement variable occurs more than once in a
transducer expression, it will always have the same value. Consider
the following transducer program:
.TP 0
$=1$ = [abc]
$=1$ X $=1$
.PP
The result transducer recognizes the strings aXa, bXb, and cXc. Only
acyclic transducers (i.e. transducers with a finite set of string
mappings) can be assigned to agreement variables.
.PP
.SM Symbols
.PP
A symbol is either
.PP
- a single character like A s 5,
.PP
- a quoted character like \\* or \\_,
.TP 2
- a multi-character symbol like <X> or <ab.c5> (which is always
enclosed in angle brackets) or
.TP
- a backslash followed by a number which is the numeric code of the
designated character
.PP
- the null symbol <>.
.PP
.SM Symbol sequence
.PP
A symbol sequence is a sequence of characters, multi-character symbols
and character ranges, e.g. a-z \\. <x>.
.PP
.SM symbol range
.PP
A symbol range is either
.PP
- a single symbol
.PP
- a symbol sequence enclosed in square brackets like [A-Za-z] or
.PP
- a symbol sequence starting with ^ and enclosed in square brackets
like [^A-Za-z] (designating the complement of [a-zA-Z]) or
.PP
- the period (which represents any symbol from the alphabet)
.PP
.SM Transducer expressions
.PP
A transducer expression (TE) is recursively defined as follows:
.TP 2
- A pair of two symbol ranges separated by a colon is a TE.
[a-z]:[A-Z]
.TP 1
- A single symbol range like [a-z] is a TE.
.br
It is a short form for [a-z]:[a-z].
.TP 1
- Two symbol sequences enclosed in braces and separated by a colon are
a TE. {a[bc]}:{def} is equivalent to a:d b:e <>:f | a:d c:e <>:f.
.TP 1
- X Y is a TE if X and Y are TEs.
.br
(Blanks are ignored unless they are quoted.)
.TP 1
- (X) is a TE if X is a TE.
.TP 1
- X op is a TE if X is a TE and op is either * (Kleene's star operator), +
(Kleene's plus operator), or ? (optionality operator)
.TP 1
- op X is a TE if X is a TE and op is either ! (negation operator), ^
(target language extraction operator), _ (source language extraction
operator), or ^_ (source and target switch operator).
.TP 1
- X op Y is a TE if X and Y are TEs and op is either & (conjunction
operator), | (disjunction operator), || (composition operator), or -
(subtraction operator)
.TP 1
- L x op y R is a TE if L and R are TEs, x and y are symbol ranges and
op is either => (two-level restriction), <= (two-level coercion), or
<=> (two-level restriction and coercion).
.TP 1
- X op L__R is a TE if X, L and R are TEs and op is either ^-> (upward
replacement), _-> (downward replacement), /-> (leftward replacement)
or \\-> (rightward replacement). Furthermore, L and R must define
automata (i.e. which map their strings onto themselves). These
operators correspond to Karttunen's replace operators. If the arrow is
followed by a question mark (?), the replacement becomes optional.
.TP 1
- X << l is a TE if X is a TE, and l is either of the form
a or the form a:b where a and b are single characters or symbols. The
result is a transducer where l was freely inserted into X. The
transducer ab << c for instance is equivalent to c*ac*bc*.
.TP 1
- X op Y L1__R1, ... , LN__RN is a TE if X, Y, L1 through LN and R1
through RN are TEs, and op is either => (general restriction), <=
(general coercion), ^=> (general surface restriction), ^<= (general
surface coercion), ^<=> (general surface restriction and coercion),
_=> (general deep restriction), _<= (general deep coercion), _<=>
(general deep restriction and coercion). (These operators were
implemented following a suggestion by Anssi Yli-Jyra.)
.TP 1
- "fname" is a TE. The compiler reads the file named fname and turns
it into a transducer of the form line1|line2|line3|... where linex is
the x-th line of the file. All characters other than : and \\ are
interpreted literally (i.e. not as operators). This TE is typically
used e.g. to read a morpheme list from a file.
.TP 1
- "<fname>" is a TE. The compiler reads a pre-compiled transducer from
the file named fname.
.PP
Further Features
.PP
Comments start with the symbol % and extend up to the end of the line.
Blanks are ignored unless they are quoted. Expressions terminate at
the end of a line unless the end of line is preceded by a backslash.
The command
.TP
#include "fname"
.PP
can be used to insert source code from a
file named fname.
The command
.TP
RE >> "fname"
.PP
stores the regular expression RE in the file fname.
The command
.TP
#use hopcroft
.PP
tells the compiler to use the Hopcroft minimisation algorithm from now on, and
.TP
#use default
.PP
switches back to the default minimisation algorithm (Brzozowski).
The command
.SH EXAMPLE
Here is an example of a simple transducer program. Assuming that
the file "adj-stems" contains the three lines
.PP
.ti +3
easy
.ti +3
late
.ti +3
big
.PP
this transducer will correctly analyze the adjective forms easy,
easier, easiest and late, later, and latest.
.PP
.nf
ALPHABET = [a-zA-Z] y:i e:<> <ADJ>:<>
$R$ = y<=>i (<ADJ>:<> e)
$R2$ = e<=><> (<ADJ>:<> e)
$R$ = $R$ & $R2$
$Stems$ = "adj-stems"
$S$ = $Stems$ <ADJ> (<pos>:<>|<cmp>:{er}|<sup>:{est})
$S$ || $R$
.fi
.SH "EXIT STATUS"
.B fst-compiler
returns 0 unless some error occurs.
.\" .SH FILES
.SH BUGS
The compiler gets the operator precedence wrong in case of two-level
rules and interprets the expression "ab c<=>d ef" as "a(b c<=>d
(ef))". Therefore, you should always surround the left context of
two-level rules with parentheses: (ab) c<=>d (ef)
.SH "SEE ALSO"
fst-mor, fst-infl, fst-infl2, fst-infl3, fst-print, fst-compact,
fst-parse, fst-compare, fst-lowmem, fst-lattice, fst-train
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de.
This software is available under the GNU Public License.
src/main/resources/SMOR/src/man1/fst-generate.1
0 → 100755
View file @
0b611765
.TH fst-generate 1 "October 2003" "" "fst-generate"
.SH NAME
fst-generate \- randomly generate strings of character pairs from a transducer
.SH SYNOPSIS
.B fst-generate
.I file1 [ file [ file ] ]
.SH OPTIONS
.TP
.B \-s
print the surface characters only
.TP
.B \-a
print the analysis characters only
.TP
.B \-h
print usage information.
.TP
.B \-q
quiet mode
.TP
.B \-d
debugging mode
.SH DESCRIPTION
.I fst-generate
randomly generates sequences of character pairs which are accepted by
the transducer.
.SH BUGS
No bugs are known so far.
.SH "SEE ALSO"
fst-parse
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de.
This software is available under the GNU Public License.
src/main/resources/SMOR/src/man1/fst-infl.1
0 → 100755
View file @
0b611765
.TH fst-infl 1 "November 2004" "" "fst-infl"
.SH NAME
fst-infl fst-infl2 fst-infl3 \- morphological analysers
.SH SYNOPSIS
.B fst-infl [ options ]
.I file
[
.I input-file
[
.I output-file
]
]
.br
.B fst-infl2 [ options ]
.I file
[
.I input-file
[
.I output-file
]
]
.br
.B fst-infl3 [ options ]
.I file
[
.I input-file
[
.I output-file
]
]
.SH OPTIONS
.TP
.B \-t file
Read an alternative transducer from
.I file
and use it if the main transducer fails to find an analysis. By
iterating this option, a cascade of transducers may be tried to find
an analysis.
.TP
.B \-b
Print surface and analysis symbols. (fst-infl2 only)
.TP
.B \-n
Print multi-character symbols without the enclosing angle brackets.
(fst-infl only)
.TP
.B \-d
The analyses are symbolically disambiguated by returning only analyses
with a minimal number of morphemes. This option requires that morpheme
boundaries are marked with the tag <X>. If no <X> tag is found in the
analysis string, then the program (basically) counts the number of
multi-character symbols consisting entirely of upper-case characters
and uses this count for disambiguation. The latter heuristic was
developed for the German SMOR morphology. (This option is only
available with fst-infl2 and fst-infl3.)
.TP
.B \-e n
If no regular analysis is found, do robust matching and print analyses
with up to
.I n
edit errors. The set of edit operations currently includes
replacement, insertion and deletion. Each operation currently has a
fixed error weight of 1. (fst-infl2 only)
.TP
.B \-% f
Disambiguates the analyses statistically and prints the most likely
analyses with at least f % of the total probability mass of the
analyses. The transducer weights are read from a file obtained by
appending
.I .prob
to the name of the transducer file. The weight files are created with
.I fst-train.
(fst-infl2 only)
.TP
.B \-p
Print the probability of each analysis. (fst-infl2 only)
.TP
.B \-c
Use this option if the transducer was compiled on a computer with a
different endianness. If you have a transducer which was compiled
on a Sparc computer and you want to use it on a Pentium, you need to
use this option. (fst-infl2 only)
.TP
.B \-q
Suppress status messages.
.TP
.B \-h
Print usage information.
.SH DESCRIPTION
.I fst-infl
is a morphological analyser. The first argument is the name of a file
which was generated by
.I fst-compiler.
The second argument is the name of the input file. The third argument
is the output file. If the third argument is missing, output is
directed to
.I stdout.
If the second argument is missing as well, input is read from
.I stdin.
.I fst-infl2
is similar to
.I fst-infl
but needs a transducer in compact format (see the man pages for
.I fst-compiler
and
.IR fst-compact ).
fst-infl2 is implemented differently from fst-infl and is usually much
faster.
.I fst-infl3
is also similar to
.I fst-infl
but needs a transducer in lowmem format (see the man pages for
.I fst-compiler
and
.IR fst-lowmem ).
fst-infl3 accesses the transducer on
disc rather than reading it into memory. It starts very fast and needs
very little memory, but is slower than fst-infl2.
.I fst-infl
reads the transducer which is stored in the argument file. Then it
reads the input file line by line. Each line is analysed with the
transducer and all resulting analyses are printed (see also the man
pages for
.I fst-mor).
.SH BUGS
No bugs are known so far.
.SH "SEE ALSO"
fst-compiler, fst-mor
.SH AUTHOR
Helmut Schmid,
Institute for Computational Linguistics,
University of Stuttgart,
Email: schmid@ims.uni-stuttgart.de.
This software is available under the GNU Public License.
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment