Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
germanet
germanet-decompounder
Commits
0b611765
Commit
0b611765
authored
Jan 23, 2019
by
Ben Campbell
Browse files
Initial commit
parents
Changes
156
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2418 additions
and
0 deletions
+2418
-0
src/main/resources/ASV_trees/kompVVic.tree
src/main/resources/ASV_trees/kompVVic.tree
+0
-0
src/main/resources/SMOR/.DS_Store
src/main/resources/SMOR/.DS_Store
+0
-0
src/main/resources/SMOR/lib/smor-ascii.ca
src/main/resources/SMOR/lib/smor-ascii.ca
+0
-0
src/main/resources/SMOR/lib/smor-cap.ca
src/main/resources/SMOR/lib/smor-cap.ca
+0
-0
src/main/resources/SMOR/lib/smor-guesser.ca
src/main/resources/SMOR/lib/smor-guesser.ca
+0
-0
src/main/resources/SMOR/lib/smor-ss.ca
src/main/resources/SMOR/lib/smor-ss.ca
+0
-0
src/main/resources/SMOR/lib/smor-uc.ca
src/main/resources/SMOR/lib/smor-uc.ca
+0
-0
src/main/resources/SMOR/lib/smor.ca
src/main/resources/SMOR/lib/smor.ca
+0
-0
src/main/resources/SMOR/src/Makefile
src/main/resources/SMOR/src/Makefile
+211
-0
src/main/resources/SMOR/src/Socket.h
src/main/resources/SMOR/src/Socket.h
+61
-0
src/main/resources/SMOR/src/Transducer.h
src/main/resources/SMOR/src/Transducer.h
+102
-0
src/main/resources/SMOR/src/TransducerTest.C
src/main/resources/SMOR/src/TransducerTest.C
+43
-0
src/main/resources/SMOR/src/alphabet.C
src/main/resources/SMOR/src/alphabet.C
+864
-0
src/main/resources/SMOR/src/alphabet.h
src/main/resources/SMOR/src/alphabet.h
+296
-0
src/main/resources/SMOR/src/alphabet.o
src/main/resources/SMOR/src/alphabet.o
+0
-0
src/main/resources/SMOR/src/basic.C
src/main/resources/SMOR/src/basic.C
+85
-0
src/main/resources/SMOR/src/basic.h
src/main/resources/SMOR/src/basic.h
+27
-0
src/main/resources/SMOR/src/basic.o
src/main/resources/SMOR/src/basic.o
+0
-0
src/main/resources/SMOR/src/compact.C
src/main/resources/SMOR/src/compact.C
+629
-0
src/main/resources/SMOR/src/compact.h
src/main/resources/SMOR/src/compact.h
+100
-0
No files found.
src/main/resources/ASV_trees/kompVVic.tree
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/.DS_Store
0 → 100644
View file @
0b611765
File added
src/main/resources/SMOR/lib/smor-ascii.ca
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/lib/smor-cap.ca
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/lib/smor-guesser.ca
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/lib/smor-ss.ca
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/lib/smor-uc.ca
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/lib/smor.ca
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/Makefile
0 → 100755
View file @
0b611765
# TYPE is either debug, prof, static, lib, Sun64bit or the empty string
TYPE
=
# try different definitions of SGILIB if hash_map is not found
#SGILIB=
SGILIB
=
-DSGI__gnu_cxx
#SGILIB=-DSGIext
# comment the following line if readline is not found
#READLINE = -DREADLINE
#LREADLINE = -lreadline -lhistory -lncurses
# data type used for the encoding of the symbols (char, short, or int)
# short is the default
#CDT=-DCODE_DATA_TYPE=short
CXX
=
g++
WARNING
=
-Wall
-Wcast-qual
-Wconversion
-std
=
c++98
-Wno-deprecated
-ansi
ifeq
($(TYPE),debug)
DEBUG
=
-ggdb
OPTIMIZATION
=
-O0
else
DEBUG
=
-DNDEBUG
ifeq
($(TYPE),prof)
OPTIMIZATION
=
-O2
PROFILE
=
-pg
else
OPTIMIZATION
=
-O3
ifeq
($(TYPE),static)
STATIC
=
-static
else
ifeq
($(TYPE),Sun64bit)
LOPT
=
-L
/usr/lib/64
-L
/usr/local/lib/sparcv9
-R
/usr/lib/64
-R
/usr/local/lib/sparcv9
CXX
=
g++64
BITS
=
-mcpu
=
v9
else
ifeq
($(TYPE),lib)
LIB
=
-fPIC
endif
endif
endif
endif
endif
CC
=
g++
CL
=
$(CC)
CFLAGS
=
$(DEBUG)
$(PROFILE)
$(LIB)
$(OPTIMIZATION)
$(WARNING)
$(SGILIB)
$(CDT)
$(BITS)
$(READLINE)
LDFLAGS
=
$(DEBUG)
$(PROFILE)
$(STATIC)
$(LOPT)
CXXFLAGS
=
$(CFLAGS)
LXXFLAGS
=
$(LDFLAGS)
INSTALL
=
install
INSTALL_MANPAGES
=
install
-m
644
INSTALL_DIR
=
install
-d
-m
755
DESTDIR
=
PREFIX
=
/usr/local/
PROGRAMS
=
fst-compiler fst-infl fst-generate fst-print fst-compare
\
fst-compact fst-infl2 fst-lowmem fst-infl3 fst-lattice fst-match
\
fst-parse fst-parse2 fst-train fst-compiler-utf8 fst-text2bin fst-mor
ALLPROGRAMS
=
$(PROGRAMS)
ifeq
($(TYPE),lib)
TARGET
=
libsfst.tar.gz
else
ifeq
($(TYPE),debug)
TARGET
=
$(ALLPROGRAMS)
else
TARGET
=
strip
endif
endif
OBJ
=
basic.o utf8.o alphabet.o fst.o operators.o determinise.o hopcroft.o
OBJ2
=
$(OBJ)
compact.o make-compact.o interface.o
# Targets that only run commands and never create a file of the same name.
# maninstall is listed as well so that a stray file called "maninstall"
# cannot suppress the manual page installation.
.PHONY: all clean realclean archive remake install maninstall strip
all
:
$(TARGET)
strip
:
$(ALLPROGRAMS)
strip
$(ALLPROGRAMS)
fst-api.zip
:
test.C alphabet.C alphabet.h basic.C basic.h compact.C
\
compact.h lowmem.C lowmem.h robust.C
zip
$@
$^
fst-compact
:
compact.o make-compact.o
fst-generate
:
generate.o
fst-match
:
compact.o
fst-train
:
compact.o
fst-compiler
:
$(OBJ2) default-scanner.o
fst-compiler-utf8
:
$(OBJ2) utf8-scanner.o fst-compiler.o
$(CXX)
-o
$@
$^
$(LDFLAGS)
fst-mor
:
$(OBJ) fst-mor.o
$(CXX)
-o
$@
$^
$(LDFLAGS)
$(LREADLINE)
fst-text2bin
:
$(OBJ) fst-text2bin.o
$(CXX)
-o
$@
$^
$(LDFLAGS)
%
:
$(OBJ) %.o
$(CXX)
-o
$@
$^
$(LDFLAGS)
fst-infl2
:
alphabet.o basic.o compact.o robust.o utf8.o fst-infl2.o
$(CXX)
-o
$@
$^
$(LDFLAGS)
fst-infl2-daemon
:
alphabet.o basic.o compact.o robust.o utf8.o fst-infl2-daemon.o
$(CXX)
-o
$@
$^
$(LDFLAGS)
fst-infl3
:
alphabet.o basic.o utf8.o lowmem.o fst-infl3.o
$(CXX)
-o
$@
$^
$(LDFLAGS)
test
:
test.o
$(CL)
-L
.
-lsfst
-o
$@
$^
$(LDFLAGS)
fst-compiler.C
:
fst.h compact.h make-compact.h interface.h fst-compiler.yy
bison
-d
-o
$@
fst-compiler.yy
if
[
-f
fst-compiler.C.h
]
;
then
mv
fst-compiler.C.h fst-compiler.h
;
fi
if
[
-f
fst-compiler.H
]
;
then
mv
fst-compiler.H fst-compiler.h
;
fi
default-scanner.C
:
fst.h interface.h scanner.ll fst-compiler.C default-scanner.ll
flex
-o
$@
default-scanner.ll
utf8-scanner.C
:
fst.h interface.h scanner.ll fst-compiler.C utf8-scanner.ll
flex
-o
$@
utf8-scanner.ll
default-scanner.ll
:
scanner.ll
sed
-e
'/^utf8>/d'
-e
's/^default> *//'
scanner.ll
>
default-scanner.ll
utf8-scanner.ll
:
scanner.ll
sed
-e
'/^default>/d'
-e
's/^utf8> *//'
scanner.ll
>
utf8-scanner.ll
libsfst.tar.gz
:
libsfst.so compact.h alphabet.h basic.h fst.h
tar
-zcf
$@
$^
libsfst.so
:
alphabet.o basic.o utf8.o compact.o fst.o operators.o determinise.o generate.o hopcroft.o
$(CL)
-shared
-o
$@
$^
$(LDFLAGS)
remake
:
clean $(ALLPROGRAMS)
clean
:
-
rm
-f
*
.o
*
~ Makefile.bak y.tab.h man1/
*
~ 2>&-
>
/dev/null
realclean
:
clean
-
rm
-f
$(ALLPROGRAMS)
test
2>&-
>
/dev/null
install
:
$(PROGRAMS)
# $(INSTALL_DIR) $(DESTDIR)$(PREFIX)bin
for
p
in
$(PROGRAMS);
do
$(INSTALL)
$$p
$(DESTDIR)$(PREFIX)bin/$$p;
done
maninstall
:
$(INSTALL_DIR)
$(DESTDIR)$(PREFIX)
man/man1
for
m
in
man1/
*
.1
;
do
$(INSTALL_MANPAGES)
$$
m
$(DESTDIR)$(PREFIX)
man/
$$
m
;
done
archive
:
tar
-zhcf
VERSION-
`
date
'+%y%m%d'
`
.tar.gz
*
.ll
*
.yy
*
.[Ch] Makefile man1/
*
Makefile
:
*.C *.ll *.yy *.h
-
makedepend
-Y
--
$(CFLAGS)
--
*
.C 2>/dev/null
# DO NOT DELETE
TransducerTest.o
:
Transducer.h
alphabet.o
:
utf8.h alphabet.h basic.h sgi.h
basic.o
:
basic.h
compact.o
:
compact.h alphabet.h basic.h sgi.h
default-scanner.o
:
interface.h utf8.h fst.h alphabet.h basic.h sgi.h mem.h
default-scanner.o
:
fst-compiler.h
determinise.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-compact.o
:
make-compact.h fst.h alphabet.h basic.h sgi.h mem.h compact.h
fst-compare.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-compiler.o
:
make-compact.h fst.h alphabet.h basic.h sgi.h mem.h compact.h
fst-compiler.o
:
interface.h utf8.h
fst-generate.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-infl.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-infl2-daemon.o
:
Socket.h compact.h alphabet.h basic.h sgi.h
fst-infl2.o
:
compact.h alphabet.h basic.h sgi.h
fst-infl3.o
:
lowmem.h alphabet.h basic.h sgi.h
fst-lattice.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-lowmem.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-match.o
:
compact.h alphabet.h basic.h sgi.h
fst-mor.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-parse.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-parse2.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-print.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-text2bin.o
:
fst.h alphabet.h basic.h sgi.h mem.h
fst-train.o
:
compact.h alphabet.h basic.h sgi.h
fst.o
:
fst.h alphabet.h basic.h sgi.h mem.h
generate.o
:
fst.h alphabet.h basic.h sgi.h mem.h
interface.o
:
interface.h utf8.h fst.h alphabet.h basic.h sgi.h mem.h
lowmem.o
:
lowmem.h alphabet.h basic.h sgi.h
make-compact.o
:
make-compact.h fst.h alphabet.h basic.h sgi.h mem.h compact.h
operators.o
:
fst.h alphabet.h basic.h sgi.h mem.h
robust.o
:
compact.h alphabet.h basic.h sgi.h
test.o
:
compact.h alphabet.h basic.h sgi.h
utf8-scanner.o
:
interface.h utf8.h fst.h alphabet.h basic.h sgi.h mem.h
utf8-scanner.o
:
fst-compiler.h
utf8.o
:
utf8.h
# The object is called hopcroft.o in OBJ and in the libsfst.so rule; the
# former spelling "hopcraft.o" never matched, so header changes did not
# trigger a rebuild of this object.
hopcroft.o: fst.h alphabet.h basic.h sgi.h mem.h
src/main/resources/SMOR/src/Socket.h
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* File: Socket.h */
/* Author: Helmut Schmid */
/* Purpose: */
/* Created: Fri Aug 15 14:19:19 2008 */
/* Modified: Wed Sep 29 08:44:43 2010 (schmid) */
/* */
/*******************************************************************/
namespace SFST {

  /*****************  class Socket  **********************************/

  /* Listening TCP socket.  The constructor creates the socket, binds it
     to the given port on all local addresses and starts listening;
     next_client() then blocks until a client connects.  Any failure
     during setup prints a message to stderr and terminates the program. */

  class Socket {

    int portno;   /* port address */
    int sockfd;   /* descriptor of the listening socket */
    struct sockaddr_in serv_addr;  /* local address the socket is bound to */
    struct sockaddr cli_addr;      /* address of the most recent client */
    socklen_t clilen;              /* size of cli_addr (value-result argument) */

  public:

    /* Waits for the next client and returns the descriptor of the new
       connection (or the negative error value returned by accept). */
    int next_client() {
      // accept() overwrites the length argument, so it has to be reset
      // before every call.  The peer address goes into cli_addr; the
      // server address must not be clobbered.
      clilen = sizeof(cli_addr);
      return accept( sockfd, &cli_addr, &clilen );
    }

    Socket( int port=7070 ) : portno(port), sockfd(-1) {

      /* create a socket */
      sockfd = socket(AF_INET, SOCK_STREAM, 0);
      if (sockfd < 0) {
        fprintf(stderr,"ERROR opening socket\n");
        exit(1);
      }

      /* initialise serv_addr with zeros */
      bzero((char *) &serv_addr, sizeof(serv_addr));

      serv_addr.sin_family = AF_INET;

      /* convert portno to network byte order and */
      /* store it in serv_addr.sin_port */
      serv_addr.sin_port = htons((unsigned short)portno);

      /* set the host IP address (available in INADDR_ANY) */
      serv_addr.sin_addr.s_addr = INADDR_ANY;

      /* bind the socket to a host and port */
      if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
        fprintf(stderr,"ERROR on binding\n");
        exit(1);
      }

      /* Listen to the socket; up to 5 connections at a time */
      if (listen(sockfd, 5) < 0) {
        fprintf(stderr,"ERROR on listen\n");
        exit(1);
      }

      clilen = sizeof(cli_addr);
    }
  };

}
src/main/resources/SMOR/src/Transducer.h
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* File: Transducer.h */
/* Author: Helmut Schmid */
/* */
/*******************************************************************/
#include <stdio.h>
#include <vector>
namespace SFST {

  /* A single arc of the transducer.  A character value of 0 stands for
     the empty symbol "<>" on that side of the arc. */
  class Transition {
  public:
    char lower;
    char upper;
    int target;
    Transition( char l, char u, size_t t )
      : lower(l), upper(u), target(static_cast<int>(t)) {}
  };


  /* A state with its outgoing transitions. */
  class State {
  public:
    bool final;
    std::vector<Transition> transition;
    State() : final(false) {}
  };


  /* Simple transducer read from a text file.  State 0 is the start
     state.  Each line of the file is one of
       final: N       state N is final
       S l:<> T       transition S -> T, lower symbol l, empty upper symbol
       S <>:u T       transition S -> T, empty lower symbol, upper symbol u
       S l:u T        transition S -> T, lower symbol l, upper symbol u
       S c T          transition S -> T, symbol c on both sides           */
  class Transducer {

  private:
    std::vector<State> state;

    // makes sure that state number sn exists; rejects negative numbers
    bool ensure_state( int sn ) {
      if (sn < 0)
        return false;
      if ((size_t)sn >= state.size())
        state.resize((size_t)sn + 1);
      return true;
    }

    // adds the transition s -> t; source AND target state are created if
    // necessary so that analyze1 never follows an arc to a missing state
    bool add_transition( int s, char l, char u, int t ) {
      if (!ensure_state(s) || !ensure_state(t))
        return false;
      state[s].transition.push_back(Transition(l, u, (size_t)t));
      return true;
    }

    // depth-first search for all paths from state sn which match the
    // remaining input s on the upper side; the lower-side symbols of
    // the current path are collected in ana
    void analyze1( int sn, const char *s, std::vector<char> &ana,
                   std::vector<std::vector<char> > &analyses )
    {
      // nothing to do for non-existent states (e.g. empty transducer)
      if (sn < 0 || (size_t)sn >= state.size())
        return;

      if (*s == 0 && state[sn].final)
        analyses.push_back(ana);

      std::vector<Transition> &t = state[sn].transition;
      for( size_t i=0; i<t.size(); i++ ) {
        const char *rest;
        if (t[i].upper == 0)
          rest = s;       // empty upper symbol: no input is consumed
        else if (t[i].upper == *s)
          rest = s + 1;   // consume the matching input character
        else
          continue;
        ana.push_back(t[i].lower);
        analyze1(t[i].target, rest, ana, analyses);
        ana.pop_back();
      }
    }

  public:

    /* Reads the transducer definition from file.  A malformed line or a
       negative state number is reported on stderr and terminates the
       program with exit status 1. */
    Transducer( FILE *file ) {
      char buffer[1000];
      for( unsigned int line=0; fgets(buffer, 1000, file); line++ ) {
        int s=0, t=0;
        char u=0, l=0;
        bool ok;
        if (sscanf(buffer, "final: %d", &s) == 1) {
          ok = ensure_state(s);
          if (ok)
            state[s].final = true;
        }
        else if (sscanf(buffer, "%d %c:<> %d", &s, &l, &t) == 3)
          ok = add_transition(s, l, 0, t);
        else if (sscanf(buffer, "%d <>:%c %d", &s, &u, &t) == 3)
          ok = add_transition(s, 0, u, t);
        else if (sscanf(buffer, "%d %c:%c %d", &s, &l, &u, &t) == 4)
          ok = add_transition(s, l, u, t);
        else if (sscanf(buffer, "%d %c %d", &s, &l, &t) == 3)
          ok = add_transition(s, l, l, t);
        else
          ok = false;

        if (!ok) {
          fprintf(stderr,"Error: in line %u of transducer file at: %s\n",
                  line, buffer);
          exit(1);
        }
      }
    }

    /* Appends to analyses the lower-side symbol sequence of every path
       from state 0 to a final state whose upper side spells s. */
    void analyze( const char *s, std::vector<std::vector<char> > &analyses ) {
      std::vector<char> ana;
      analyze1(0, s, ana, analyses);
    }
  };

}
src/main/resources/SMOR/src/TransducerTest.C
0 → 100755
View file @
0b611765
#include "Transducer.h"
#include <cstdlib>
#include <cstring>
#include <iostream>
using
std
::
cerr
;
int
main
(
int
argc
,
char
**
argv
)
{
FILE
*
file
;
file
=
fopen
(
argv
[
1
],
"rb"
);
// open the input file
if
(
file
==
NULL
)
exit
(
1
);
try
{
Transducer
transducer
(
file
);
// (1) read the transducer
char
buffer
[
1000
];
while
(
fgets
(
buffer
,
1000
,
stdin
))
{
// (2) next input line
// delete newline character
int
l
=
strlen
(
buffer
)
-
1
;
if
(
buffer
[
l
]
==
'\n'
)
buffer
[
l
]
=
'\0'
;
printf
(
"> %s
\n
"
,
buffer
);
// print the input line
vector
<
vector
<
char
>
>
analyses
;
transducer
.
analyze
(
buffer
,
analyses
);
// (3) analyse the input
if
(
analyses
.
size
()
==
0
)
printf
(
"no result for %s
\n
"
,
buffer
);
else
for
(
size_t
i
=
0
;
i
<
analyses
.
size
();
i
++
)
{
for
(
size_t
k
=
0
;
k
<
analyses
[
i
].
size
();
k
++
)
fputc
(
analyses
[
i
][
k
],
stdout
);
fputc
(
'\n'
,
stdout
);
}
}
}
catch
(
const
char
*
p
)
{
cerr
<<
p
<<
"
\n
"
;
return
1
;
}
return
0
;
}
src/main/resources/SMOR/src/alphabet.C
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE alphabet.C */
/* MODULE alphabet */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE basic FST functions */
/* */
/*******************************************************************/
#include <climits>
#include <cstring>
#include "utf8.h"
#include "alphabet.h"
namespace
SFST
{
using std::vector;
using std::ostream;

// size of the stack buffer into which Alphabet::read reads one symbol string
const int BUFFER_SIZE=100000;

// string under which the Alphabet constructor registers Label::epsilon
char EpsilonString[]="<>";
/*******************************************************************/
/* */
/* Alphabet::add */
/* */
/*******************************************************************/
void
Alphabet
::
add
(
const
char
*
symbol
,
Character
c
)
{
char
*
s
=
fst_strdup
(
symbol
);
cm
[
c
]
=
s
;
sm
[
s
]
=
c
;
}
/*******************************************************************/
/* */
/* Alphabet::Alphabet */
/* */
/*******************************************************************/
// A new alphabet is not in UTF-8 mode and contains only the
// empty symbol "<>".
Alphabet::Alphabet()
{
  utf8 = false;
  const Character empty_code = Label::epsilon;
  add(EpsilonString, empty_code);
}
/*******************************************************************/
/* */
/* Alphabet::clear */
/* */
/*******************************************************************/
void
Alphabet
::
clear
()
{
char
**
s
=
new
char
*
[
cm
.
size
()];
ls
.
clear
();
sm
.
clear
();
size_t
i
,
n
=
0
;
for
(
CharMap
::
iterator
it
=
cm
.
begin
();
it
!=
cm
.
end
();
it
++
)
s
[
n
++
]
=
it
->
second
;
cm
.
clear
();
for
(
i
=
0
;
i
<
n
;
i
++
)
free
(
s
[
i
]);
delete
[]
s
;
}
/*******************************************************************/
/* */
/* Alphabet::print */
/* */
/*******************************************************************/
void
Alphabet
::
print
(
void
)
{
for
(
CharMap
::
iterator
it
=
cm
.
begin
();
it
!=
cm
.
end
();
it
++
)
fprintf
(
stderr
,
"%i
\t
%s
\n
"
,
it
->
first
,
it
->
second
);
return
;
}
/*******************************************************************/
/* */
/* Alphabet::new_marker */
/* */
/*******************************************************************/
// Creates a new marker symbol named ">N<", where N is the first
// character code (starting at 1) which is not yet in use, and returns
// that code.  Throws an error message if no code is left.
Character Alphabet::new_marker()
{
  Character code = 1;
  do {
    if (cm.find(code) == cm.end()) {
      // unused code found: create a unique identifier string
      char name[100];
      sprintf(name, ">%ld<", (long)code);
      add(name, code);
      return code;
    }
    code++;
  } while (code != 0);

  throw "Error: too many symbols in transducer definition";
}
/*******************************************************************/
/* */
/* is_marker_symbol */
/* */
/*******************************************************************/
// Recognizes strings matching the expression ">[0-9]+<".
// A NULL pointer is never a marker symbol.
static bool is_marker_symbol( const char *s )
{
  if (s == NULL || s[0] != '>')
    return false;

  // skip the digits following the opening '>'
  const char *p = s + 1;
  while (*p >= '0' && *p <= '9')
    p++;

  // at least one digit, then '<' as the very last character
  return p > s + 1 && p[0] == '<' && p[1] == 0;
}
/*******************************************************************/
/* */
/* Alphabet::delete_markers */
/* */
/*******************************************************************/
void
Alphabet
::
delete_markers
()
{
vector
<
char
*>
sym
;
vector
<
Character
>
code
;
vector
<
Label
>
label
;
for
(
CharMap
::
const_iterator
it
=
cm
.
begin
();
it
!=
cm
.
end
();
it
++
)
{
Character
c
=
it
->
first
;
char
*
s
=
it
->
second
;
if
(
!
is_marker_symbol
(
s
))
{
sym
.
push_back
(
fst_strdup
(
s
));
code
.
push_back
(
c
);
}
}
for
(
LabelSet
::
const_iterator
it
=
begin
();
it
!=
end
();
it
++
)
{
Label
l
=*
it
;
if
(
!
is_marker_symbol
(
code2symbol
(
l
.
upper_char
()))
&&
!
is_marker_symbol
(
code2symbol
(
l
.
lower_char
())))
label
.
push_back
(
l
);
}
clear
();
for
(
size_t
i
=
0
;
i
<
sym
.
size
();
i
++
)
{
add_symbol
(
sym
[
i
],
code
[
i
]);
free
(
sym
[
i
]);
}
for
(
size_t
i
=
0
;
i
<
label
.
size
();
i
++
)
insert
(
label
[
i
]
);
}
/*******************************************************************/
/* */
/* Alphabet::add_symbol */
/* */
/*******************************************************************/
// Returns the code of the symbol.  An unknown symbol is assigned the
// first unused character code (starting at 1).  Throws an error message
// if no code is left.
Character Alphabet::add_symbol( const char *symbol )
{
  // already defined?
  if (sm.find(symbol) != sm.end())
    return sm[symbol];

  for( Character code=1; code!=0; code++ ) {
    if (cm.find(code) != cm.end())
      continue;  // this code is taken
    add(symbol, code);
    return code;
  }

  throw "Error: too many symbols in transducer definition";
}
/*******************************************************************/
/* */
/* Alphabet::add_symbol */
/* */
/*******************************************************************/
// Defines the symbol with the given character code c.
// Nothing happens if the symbol is already defined with this code.
// Throws an error message (a C string) if the symbol is already defined
// with a different code, or if the code is already used for a different
// symbol.
void Alphabet::add_symbol( const char *symbol, Character c )
{
  // check whether the symbol was previously defined
  int sc = symbol2code(symbol);
  if (sc != EOF) {
    if ((Character)sc == c)
      return;

    if (strlen(symbol) < 60) {
      // snprintf instead of sprintf: the fixed part of the message plus a
      // symbol of up to 59 characters did not fit into the former
      // 100 byte buffer
      static char message[256];
      snprintf(message, sizeof(message),
               "Error: reinserting symbol '%s' in alphabet with incompatible character value %u %u",
               symbol, (unsigned)sc, (unsigned)c);
      throw message;
    }
    else
      throw "reinserting symbol in alphabet with incompatible character value";
  }

  // check whether the character is already in use
  const char *s = code2symbol(c);
  if (s == NULL)
    add(symbol, c);
  else if (strcmp(s, symbol) != 0) {
    // the previously defined symbol s has no length limit, therefore
    // the message is truncated to the buffer size if necessary
    static char message[256];
    if (strlen(symbol) < 70)
      snprintf(message, sizeof(message),
               "Error: defining symbol %s as character %u (previously defined as %s)",
               symbol, (unsigned)c, s);
    else
      snprintf(message, sizeof(message),
               "Error: defining a (very long) symbol with previously used character");
    throw message;
  }
}
/*******************************************************************/
/* */
/* Alphabet::write_char */
/* */
/*******************************************************************/
// Appends the printed form of the character c to buffer at offset *pos,
// advances *pos and terminates the buffer with '\0'.
// Known codes are printed as their symbol; the enclosing angle brackets
// are dropped if with_brackets is false.  Unknown codes in the range
// 32..255 are printed as a single character, all others as "\<number>".
// The caller has to provide a sufficiently large buffer.
void Alphabet::write_char( Character c, char *buffer, int *pos,
                           bool with_brackets ) const
{
  const char *name = code2symbol(c);

  if (name == NULL) {
    const unsigned int value = c;
    if (value < 32 || value >= 256) {
      // sprintf returns the number of characters written
      *pos += sprintf(buffer + (*pos), "\\%u", value);
    }
    else {
      buffer[*pos] = (char)c;
      ++(*pos);
    }
  }
  else {
    int first = 0;
    int last = (int)strlen(name) - 1;
    if (!with_brackets && name[first] == '<' && name[last] == '>') {
      first++;
      last--;
    }
    for( int k=first; k<=last; k++ ) {
      buffer[*pos] = name[k];
      ++(*pos);
    }
  }

  buffer[*pos] = '\0';
}
/*******************************************************************/
/* */
/* Alphabet::write_char */
/* */
/*******************************************************************/
// Returns the printed form of the character c.  The result is stored in
// a static buffer which is overwritten by the next call.
const char *Alphabet::write_char( Character c, bool with_brackets ) const
{
  static char text[1000];
  int length = 0;

  write_char(c, text, &length, with_brackets);
  return text;
}
/*******************************************************************/
/* */
/* Alphabet::write_label */
/* */
/*******************************************************************/
// Appends the printed form of the label l to buffer at offset *pos.
// The lower character is printed first; the upper character follows
// after a colon unless both characters are identical.
void Alphabet::write_label( Label l, char *buffer, int *pos,
                            bool with_brackets ) const
{
  const Character lower = l.lower_char();
  const Character upper = l.upper_char();

  write_char(lower, buffer, pos, with_brackets);
  if (lower == upper)
    return;

  buffer[*pos] = ':';
  ++(*pos);
  write_char(upper, buffer, pos, with_brackets);
}
/*******************************************************************/
/* */
/* Alphabet::write_label */
/* */
/*******************************************************************/
// Returns the printed form of the label l.  The result is stored in a
// static buffer which is overwritten by the next call.
const char *Alphabet::write_label( Label l, bool with_brackets ) const
{
  static char text[1000];
  int length = 0;

  write_label(l, text, &length, with_brackets);
  return text;
}
/*******************************************************************/
/* */
/* Alphabet::insert_symbols */
/* */
/*******************************************************************/
void
Alphabet
::
insert_symbols
(
const
Alphabet
&
a
)
{
for
(
CharMap
::
const_iterator
it
=
a
.
cm
.
begin
();
it
!=
a
.
cm
.
end
();
it
++
)
add_symbol
(
it
->
second
,
it
->
first
);
}
/*******************************************************************/
/* */
/* Alphabet::complement */
/* */
/*******************************************************************/
// Replaces the content of sym by all character codes of the alphabet
// (except for the empty symbol) which are not contained in sym.
void Alphabet::complement( vector<Character> &sym )
{
  vector<Character> others;

  for( CharMap::const_iterator entry=cm.begin(); entry!=cm.end(); ++entry ) {
    const Character c = entry->first;
    if (c == Label::epsilon)
      continue;

    // linear search for c in the given list
    bool listed = false;
    for( size_t k=0; k<sym.size() && !listed; k++ )
      listed = (sym[k] == c);

    if (!listed)
      others.push_back(c);
  }

  sym.swap(others);
}
/*******************************************************************/
/* */
/* Alphabet::copy */
/* */
/*******************************************************************/
void
Alphabet
::
copy
(
const
Alphabet
&
a
)
{
insert_symbols
(
a
);
utf8
=
a
.
utf8
;
for
(
LabelSet
::
const_iterator
it
=
a
.
begin
();
it
!=
a
.
end
();
it
++
)
ls
.
insert
(
*
it
);
}
/*******************************************************************/
/* */
/* Alphabet::compose */
/* */
/*******************************************************************/
void
Alphabet
::
compose
(
const
Alphabet
&
la
,
const
Alphabet
&
ua
)
{
// insert the symbols
insert_symbols
(
la
);
insert_symbols
(
ua
);
utf8
=
la
.
utf8
;
hash_map
<
Character
,
hash_set
<
Character
>
>
cs
;
// create a hash table for a quick lookup of the target characters
for
(
iterator
it
=
ua
.
begin
();
it
!=
ua
.
end
();
it
++
)
{
Character
lc
=
it
->
lower_char
();
if
(
lc
==
Label
::
epsilon
)
insert
(
*
it
);
else
cs
[
lc
].
insert
(
it
->
upper_char
());
}
for
(
iterator
it
=
la
.
begin
();
it
!=
la
.
end
();
it
++
)
{
Character
uc
=
it
->
upper_char
();
if
(
uc
==
Label
::
epsilon
)
insert
(
*
it
);
else
{
if
(
cs
.
find
(
uc
)
!=
cs
.
end
())
{
hash_set
<
Character
>
s
=
cs
[
uc
];
Character
lc
=
it
->
lower_char
();
for
(
hash_set
<
Character
>::
iterator
it
=
s
.
begin
();
it
!=
s
.
end
();
it
++
)
insert
(
Label
(
lc
,
*
it
));
}
}
}
}
/*******************************************************************/
/* */
/* operator<<(Alphabet) */
/* */
/*******************************************************************/
// Prints one "code -> symbol" line per symbol, followed by a single line
// with all labels separated by blanks.
ostream &operator<<( ostream &s, const Alphabet &a )
{
  Alphabet::CharMap::const_iterator entry = a.cm.begin();
  while (entry != a.cm.end()) {
    s << entry->first << " -> " << entry->second << "\n";
    ++entry;
  }

  Alphabet::iterator p = a.begin();
  while (p != a.end()) {
    s << a.write_label(*p) << " ";
    p++;
  }
  s << "\n";

  return s;
}
/*******************************************************************/
/* */
/* Alphabet::next_mcsym */
/* */
/* recognizes multi-character symbols which are enclosed with */
/* angle brackets <...>. If the argument flag insert is false, */
/* the multi-character symbol must already be in the alphabet in */
/* order to be recognized; if it is true, a new symbol is added. */
/* */
/*******************************************************************/
int
Alphabet
::
next_mcsym
(
char
*
&
string
,
bool
insert
)
{
char
*
start
=
string
;
if
(
*
start
==
'<'
)
// symbol might start here
for
(
char
*
end
=
start
+
1
;
*
end
;
end
++
)
if
(
*
end
==
'>'
)
{
// matching pair of angle brackets found
// mark the end of the substring with \0
char
lastc
=
*
(
++
end
);
*
end
=
0
;
int
c
;
if
(
insert
)
c
=
add_symbol
(
start
);
else
c
=
symbol2code
(
start
);
// restore the original string
*
end
=
lastc
;
if
(
c
!=
EOF
)
{
// symbol found
// return its code
string
=
end
;
return
(
Character
)
c
;
}
else
// not a complex character
break
;
}
return
EOF
;
}
/*******************************************************************/
/* */
/* Alphabet::next_code */
/* */
/*******************************************************************/
// Reads the next symbol from string, advances string behind it and
// returns its character code.  Returns EOF at the end of the string or
// if the UTF-8 encoding is invalid.
// Multi-character symbols <...> are handled by next_mcsym (to which the
// flag insert is passed on).  If extended is true, a backslash in front
// of the next character is skipped.  Single characters are always
// entered into the alphabet with add_symbol.
int Alphabet::next_code( char* &string, bool extended, bool insert )
{
  if (*string == 0)
    return EOF; // finished

  int c = next_mcsym(string, insert);
  if (c != EOF)
    return c;

  if (extended && *string == '\\')
    string++; // remove quotation

  if (utf8) {
    unsigned int uc = utf8toint( &string );
    if (uc == 0) {
      fprintf(stderr, "Error in UTF-8 encoding!\n");
      return EOF; // error encountered in utf8 character
    }
    return (int)add_symbol(int2utf8(uc));
  }

  // A quoting backslash at the very end of the string is not followed by
  // any character.  Stop here instead of defining an empty symbol and
  // moving the pointer behind the terminating '\0'.
  if (*string == 0)
    return EOF;

  char buffer[2];
  buffer[0] = *string;
  buffer[1] = 0;
  string++;
  return (int)add_symbol(buffer);
}
/*******************************************************************/
/* */
/* Alphabet::next_label */
/* */
/*******************************************************************/
// Reads the next label from string and advances string behind it.
// If extended is true, a label may be written as a pair "x:y".
// Empty labels are skipped.  At the end of the string a default
// constructed Label is returned.  Throws an error message (a C string)
// if the second symbol of a pair is missing.
Label Alphabet::next_label( char* &string, bool extended )
{
  // read first character
  int c = next_code( string, extended );
  if (c == EOF)
    return Label(); // end of string reached

  Character lc = (Character)c;
  if (!extended || *string != ':') { // single character?
    if (lc == Label::epsilon)
      return next_label(string, extended); // ignore epsilon
    return Label(lc);
  }

  // read second character
  string++; // jump over ':'
  c = next_code( string );
  if (c == EOF) {
    // snprintf instead of sprintf: the rest of the input has no length
    // limit and is truncated to the buffer size if necessary
    static char buffer[1000];
    snprintf(buffer, sizeof(buffer),
             "Error: incomplete symbol in input file: %s", string);
    throw buffer;
  }

  Label l(lc, (Character)c);
  if (l.is_epsilon())
    return next_label(string, extended); // ignore epsilon transitions
  return l;
}
/*******************************************************************/
/* */
/* Alphabet::string2symseq */
/* */
/*******************************************************************/
void
Alphabet
::
string2symseq
(
char
*
s
,
vector
<
Character
>
&
ch
)
{
int
c
;
while
((
c
=
next_code
(
s
,
false
))
!=
EOF
)
ch
.
push_back
((
Character
)
c
);
}
/*******************************************************************/
/* */
/* Alphabet::string2labelseq */
/* */
/*******************************************************************/
void
Alphabet
::
string2labelseq
(
char
*
s
,
vector
<
Label
>
&
labels
)
{
Label
l
;
while
((
l
=
next_label
(
s
))
!=
Label
::
epsilon
)
labels
.
push_back
(
l
);
}
/*******************************************************************/
/* */
/* Alphabet::store */
/* */
/*******************************************************************/
void
Alphabet
::
store
(
FILE
*
file
)
const
{
char
c
=
(
utf8
)
?
(
char
)
1
:
(
char
)
0
;
fputc
(
c
,
file
);
// write the symbol mapping
Character
n
=
(
Character
)
cm
.
size
();
fwrite
(
&
n
,
sizeof
(
n
),
1
,
file
);
for
(
CharMap
::
const_iterator
it
=
cm
.
begin
();
it
!=
cm
.
end
();
it
++
)
{
Character
c
=
it
->
first
;
char
*
s
=
it
->
second
;
fwrite
(
&
c
,
sizeof
(
c
),
1
,
file
);
fwrite
(
s
,
sizeof
(
char
),
strlen
(
s
)
+
1
,
file
);
}
// write the character pairs
n
=
(
Character
)
size
();
fwrite
(
&
n
,
sizeof
(
n
),
1
,
file
);
for
(
LabelSet
::
const_iterator
p
=
ls
.
begin
();
p
!=
ls
.
end
();
p
++
)
{
Character
c
=
p
->
lower_char
();
fwrite
(
&
c
,
sizeof
(
c
),
1
,
file
);
c
=
p
->
upper_char
();
fwrite
(
&
c
,
sizeof
(
c
),
1
,
file
);
}
if
(
ferror
(
file
))
throw
"Error encountered while writing alphabet to file
\n
"
;
}
/*******************************************************************/
/* */
/* Alphabet::read */
/* */
/*******************************************************************/
void
Alphabet
::
read
(
FILE
*
file
)
{
utf8
=
(
fgetc
(
file
)
!=
0
);
// read the symbol mapping
Character
n
=
0
;
read_num
(
&
n
,
sizeof
(
n
),
file
);
for
(
unsigned
i
=
0
;
i
<
n
;
i
++
)
{
char
buffer
[
BUFFER_SIZE
];
Character
c
;
read_num
(
&
c
,
sizeof
(
c
),
file
);
if
(
!
read_string
(
buffer
,
BUFFER_SIZE
,
file
)
||
feof
(
file
)
||
ferror
(
file
))
throw
"Error1 occurred while reading alphabet!
\n
"
;
add_symbol
(
buffer
,
c
);
}
// read the character pairs
read_num
(
&
n
,
sizeof
(
n
),
file
);
if
(
ferror
(
file
))
throw
"Error2 occurred while reading alphabet!
\n
"
;
for
(
unsigned
i
=
0
;
i
<
n
;
i
++
)
{
Character
lc
,
uc
;
read_num
(
&
lc
,
sizeof
(
lc
),
file
);
read_num
(
&
uc
,
sizeof
(
uc
),
file
);
insert
(
Label
(
lc
,
uc
));
}
if
(
ferror
(
file
))
throw
"Error3 occurred while reading alphabet!
\n
"
;
}
/*******************************************************************/
/* */
/* Alphabet::compute_score */
/* */
/*******************************************************************/
// Computes a score (zero or negative) for the analysis ana from the
// printed form of its lower characters; disambiguate() keeps the analyses
// with the highest score.
// If the analysis contains <X> symbols, the score is minus their number.
// Otherwise symbols of the form <UPPERCASE> are counted (see below),
// and the scan ends at the first symbol of the form <+UPPERCASE>.
int Alphabet::compute_score( Analysis &ana )
{
  // check whether the morpheme boundaries are explicitly marked
  // with <X> tags
  int score=0;
  for( size_t i=0; i<ana.size(); i++ ) {
    // get next symbol
    // (sym points into the static buffer of write_char and is only
    // valid until the next call of write_char)
    const char *sym=write_char(ana[i].lower_char());

    if (strcmp(sym,"<X>") == 0)
      score--;
  }
  if (score < 0)
    return score;

  // No explicit morpheme boundary markers have been found.
  // Count the number of part-of-speech and PREF tags.
  for( size_t i=0; i<ana.size(); i++ ) {

    // get next symbol
    const char *sym=write_char(ana[i].lower_char());

    // skip everything which does not start with '<' followed by
    // at least one more character
    if (sym[0] != '<' || sym[1] == 0)
      continue;

    // Is it a POS tag starting with "+" like <+NN>?
    // If so, the score computed so far is returned immediately.
    if (sym[1] == '+') {
      const char *t=sym+2;
      for( ; *t >= 'A' && *t <= 'Z'; t++) ;
      if (t > sym+2 && *t == '>')
        return score;
    }

    // Is it a potential POS tag (i.e. all uppercase)?
    const char *t = sym+1;
    for( ; *t >= 'A' && *t <= 'Z'; t++) ;
    if (t == sym+1 || *t != '>')
      continue;

    // uppercase symbol found
    if (strcmp(sym,"<SUFF>") == 0 ||
        strcmp(sym,"<OLDORTH>") == 0 ||
        strcmp(sym,"<NEWORTH>") == 0)
      continue; // not what we are looking for

    // disprefer nouns with prefixes
    // (in addition to the general decrement at the end of the loop body)
    if (strcmp(sym,"<PREF>") == 0)
      score-=2;

    if (strcmp(sym,"<V>") == 0 || strcmp(sym,"<ADJ>") == 0) {
      bool is_verb=(strcmp(sym,"<V>")==0);
      // get the next non-empty symbol
      Character c=Label::epsilon;
      size_t k;
      for( k=i+1; k<ana.size(); k++ )
        if ((c = ana[k].lower_char()) != Label::epsilon)
          break;
      // Is it a participle
      if (c != Label::epsilon) {
        sym = write_char(c);
        // skip one <OLDORTH>, <NEWORTH> or <SUFF> symbol and look at the
        // next non-empty symbol instead (if there is one)
        if (strcmp(sym,"<OLDORTH>") == 0 ||
            strcmp(sym,"<NEWORTH>") == 0 ||
            strcmp(sym,"<SUFF>") == 0) {
          for( k++; k<ana.size(); k++ )
            if ((c = ana[k].lower_char()) != Label::epsilon)
              break;
          if (c != Label::epsilon)
            sym = write_char(c);
        }
        if (is_verb &&
            (strcmp(sym,"<PPres>") == 0 || strcmp(sym,"<PPast>") == 0))
          continue; // don't consider participles as complex
        if (!is_verb &&
            (strcmp(sym,"<Sup>") == 0 || strcmp(sym,"<Comp>") == 0))
          continue; // don't consider <ADJ> followed by <Sup> or <Comp> as complex
      }
    }
    // every remaining uppercase tag lowers the score by one
    score--;
  }
  return score;
}
/*******************************************************************/
/* */
/* Alphabet::disambiguate */
/* */
/*******************************************************************/
// Keeps only the analyses with the best (highest) compute_score() value.
// The relative order of the surviving analyses is preserved.
void Alphabet::disambiguate(vector<Analysis> &analyses)
{
  // score every analysis and remember the maximum
  vector<int> score;
  int bestscore = INT_MIN;
  for (size_t i = 0; i < analyses.size(); i++) {
    const int s = compute_score(analyses[i]);
    score.push_back(s);
    if (s > bestscore)
      bestscore = s;
  }

  // compact the vector in place, dropping the suboptimal analyses
  size_t kept = 0;
  for (size_t i = 0; i < analyses.size(); i++) {
    if (score[i] != bestscore)
      continue;
    analyses[kept] = analyses[i];
    kept++;
  }
  analyses.resize(kept);
}
/*******************************************************************/
/* */
/* Alphabet::print_analysis */
/* */
/*******************************************************************/
// Renders an analysis as a NUL-terminated string.
//
// With both_layers set, every label is printed with write_label() and a label
// that prints as ":" is preceded by a backslash.  Otherwise only the lower
// (analysis) symbols are printed and epsilons are skipped.
//
// The returned buffer is held in a function-local static pointer: it is freed
// and reallocated on every call, so the result is only valid until the next
// call and must not be freed by the caller.
char *Alphabet::print_analysis(Analysis &ana, bool both_layers)
{
  vector<char> ch;

  for (size_t i = 0; i < ana.size(); i++) {
    Label l = ana[i];
    const char *s;

    if (!both_layers) {
      // analysis layer only: nothing to print for epsilon
      if (l.lower_char() == Label::epsilon)
        continue;
      s = write_char(l.lower_char());
    }
    else {
      s = write_label(l);
      // quote colons
      if (strcmp(s, ":") == 0)
        ch.push_back('\\');
    }

    // append the symbol string
    for (; *s; s++)
      ch.push_back(*s);
  }
  ch.push_back(0); // string terminator

  // replace the buffer handed out by the previous call
  static char *result = NULL;
  if (result != NULL)
    delete[] result;
  const size_t n = ch.size();
  result = new char[n];
  for (size_t i = 0; i < n; i++)
    result[i] = ch[i];
  return result;
}
/*******************************************************************/
/* */
/* Alphabet::operator== */
/* */
/*******************************************************************/
// Two alphabets are equal if their symbol maps agree: every symbol known to
// one alphabet must be known to the other one with the same symbol code.
// Only the symbol maps are compared; the label sets are not considered.
bool Alphabet::operator==(const Alphabet &alpha) const
{
  // every symbol of this alphabet must exist in alpha with the same code
  for (SymbolMap::const_iterator it = this->sm.begin();
       it != this->sm.end(); it++) {
    SymbolMap::const_iterator alpha_it = alpha.sm.find(it->first);
    if (alpha_it == alpha.sm.end())
      return false;
    // differing codes make the alphabets unequal
    // (the test used to be inverted and rejected matching codes)
    if (alpha_it->second != it->second)
      return false;
  }

  // ... and every symbol of alpha must exist in this alphabet with the same code
  for (SymbolMap::const_iterator alpha_it = alpha.sm.begin();
       alpha_it != alpha.sm.end(); alpha_it++) {
    SymbolMap::const_iterator it = this->sm.find(alpha_it->first);
    if (it == this->sm.end())
      return false;
    if (it->second != alpha_it->second)
      return false;
  }

  return true;
}
}
src/main/resources/SMOR/src/alphabet.h
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE alphabet.h */
/* MODULE alphabet */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE finite state tools */
/* */
/*******************************************************************/
#ifndef _ALPHABET_H_
#define _ALPHABET_H_
#include <stdio.h>
#include "basic.h"
#include <set>
#include <vector>
#include <iostream>
#include <cstring>
#include "sgi.h"
namespace
SFST
{
// Character is the integer type used for symbol codes.
// By default it is "unsigned short"; if the macro CODE_DATA_TYPE is defined
// at compile time, "unsigned CODE_DATA_TYPE" is used instead.
#ifndef CODE_DATA_TYPE
typedef
unsigned
short
Character
;
// data type of the symbol codes
#else
typedef
unsigned
CODE_DATA_TYPE
Character
;
#endif
// data type used to indicate whether some action is to be performed
// on the analysis level (lower) or the surface level (upper)
// (the enumerators are declared in the order upper, lower,
// so upper has the value 0 and lower the value 1)
typedef
enum
{
upper
,
lower
}
Level
;
/***************** class Label ***********************************/
// A transition label: a pair of symbol codes consisting of a "lower"
// (analysis) symbol and an "upper" (surface) symbol.
class Label {

 private:
  // storage for the two symbol codes
  struct {
    Character lower;
    Character upper;
  } label;

 public:
  // code of the empty symbol
  static const Character epsilon = 0;

  // label whose two symbols are identical (epsilon:epsilon by default)
  Label(Character c = epsilon) {
    label.lower = c;
    label.upper = c;
  }

  // label with lower symbol c1 and upper symbol c2
  Label(Character c1, Character c2) {
    label.lower = c1;
    label.upper = c2;
  }

  // symbol on the requested level
  Character get_char(Level l) const {
    if (l == upper)
      return label.upper;
    return label.lower;
  }

  // the "upper" symbol of the label (i.e. the surface symbol)
  Character upper_char() const { return label.upper; }

  // the "lower" symbol of the label (i.e. the analysis symbol)
  Character lower_char() const { return label.lower; }

  // copy of this label in which each occurrence of c is replaced by nc
  Label replace_char(Character c, Character nc) const {
    Label result(*this);
    if (result.label.lower == c)
      result.label.lower = nc;
    if (result.label.upper == c)
      result.label.upper = nc;
    return result;
  }

  // equality: both symbols have to match
  int operator==(Label l) const {
    return (label.lower == l.label.lower && label.upper == l.label.upper);
  }

  int operator!=(Label l) const {
    return !(*this == l);
  }

  // ordering needed for sorting labels in compact.C:
  // the upper symbol is compared first, the lower symbol breaks ties
  int operator<(Label l) const {
    if (upper_char() != l.upper_char())
      return upper_char() < l.upper_char();
    return lower_char() < l.lower_char();
  }

  int operator>(Label l) const {
    if (upper_char() != l.upper_char())
      return upper_char() > l.upper_char();
    return lower_char() > l.lower_char();
  }

  // true if both symbols are epsilon
  // (transitions with such labels are epsilon transitions)
  int is_epsilon() const {
    return (label.upper == epsilon && label.lower == epsilon);
  }

  // true if the "upper" symbol is epsilon
  int upper_is_epsilon() const {
    return (label.upper == epsilon);
  }

  // true if the "lower" symbol is epsilon
  int lower_is_epsilon() const {
    return (label.lower == epsilon);
  }

  // hash function needed to store labels in a hash table
  struct label_hash {
    size_t operator()(const Label l) const {
      const size_t lo = (size_t)l.lower_char();
      const size_t up = (size_t)l.upper_char();
      return lo ^ (up << 16) ^ (up >> 16);
    }
  };

  // strict ordering (lower symbol first, then upper symbol);
  // used as the comparator of Alphabet's label set
  struct label_cmp {
    bool operator()(const Label l1, const Label l2) const {
      if (l1.lower_char() < l2.lower_char())
        return true;
      return (l1.lower_char() == l2.lower_char() &&
              l1.upper_char() < l2.upper_char());
    }
  };

  // equality predicate needed to store labels in a hash table
  struct label_eq {
    bool operator()(const Label l1, const Label l2) const {
      return (l1.lower_char() == l2.lower_char() &&
              l1.upper_char() == l2.upper_char());
    }
  };
};
// an analysis is represented as a sequence of labels
typedef
std
::
vector
<
Label
>
Analysis
;
/***************** class Alphabet *******************************/
class
Alphabet
{
// string comparison operators needed to stored strings in a hash table
struct
eqstr
{
bool
operator
()(
const
char
*
s1
,
const
char
*
s2
)
const
{
return
strcmp
(
s1
,
s2
)
==
0
;
}
};
// data structure storing labels without repetitions (i.e. as a set)
typedef
std
::
set
<
Label
,
Label
::
label_cmp
>
LabelSet
;
// hash table used to map the symbols to their codes
typedef
hash_map
<
const
char
*
,
Character
,
hash
<
const
char
*>
,
eqstr
>
SymbolMap
;
public:
// HFST addition
// hash table used to map the codes back to the symbols
typedef
hash_map
<
Character
,
char
*>
CharMap
;
// HFST addition
bool
operator
==
(
const
Alphabet
&
alpha
)
const
;
private:
SymbolMap
sm
;
// maps symbols to codes
CharMap
cm
;
// maps codes to symbols
LabelSet
ls
;
// set of labels known to the alphabet
// add a new symbol with symbol code c
void
add
(
const
char
*
symbol
,
Character
c
);
public:
bool
utf8
;
// iterators over the set of known labels
typedef
LabelSet
::
iterator
iterator
;
typedef
LabelSet
::
const_iterator
const_iterator
;
Alphabet
();
~
Alphabet
()
{
clear
();
};
const_iterator
begin
()
const
{
return
ls
.
begin
();
};
const_iterator
end
()
const
{
return
ls
.
end
();
};
size_t
size
()
const
{
return
ls
.
size
();
};
// HFST additions
CharMap
get_char_map
(
void
)
{
return
cm
;
};
void
print
(
void
);
void
clear
();
void
clear_char_pairs
()
{
ls
.
clear
();
};
// lookup a label in the alphabet
iterator
find
(
Label
l
)
{
return
ls
.
find
(
l
);
};
// insert a label in the alphabet
void
insert
(
Label
l
)
{
if
(
!
l
.
is_epsilon
())
ls
.
insert
(
l
);
};
// insert the known symbols from another alphabet
void
insert_symbols
(
const
Alphabet
&
);
// insert the labels and known symbols from another alphabet
void
copy
(
const
Alphabet
&
);
// create the alphabet of a transducer obtained by a composition operation
void
compose
(
const
Alphabet
&
la
,
const
Alphabet
&
ua
);
// add a symbol to the alphabet and return its code
Character
add_symbol
(
const
char
*
symbol
);
// add a symbol to the alphabet with a given code
void
add_symbol
(
const
char
*
symbol
,
Character
c
);
// create a new marker symbol and return its code
Character
new_marker
(
void
);
void
delete_markers
();
// compute the complement of a symbol set
void
complement
(
std
::
vector
<
Character
>
&
sym
);
// return the code of the argument symbol
int
symbol2code
(
const
char
*
s
)
const
{
SymbolMap
::
const_iterator
p
=
sm
.
find
(
s
);
if
(
p
!=
sm
.
end
())
return
p
->
second
;
return
EOF
;
};
// return the symbol for the given symbol code
const
char
*
code2symbol
(
Character
c
)
const
{
CharMap
::
const_iterator
p
=
cm
.
find
(
c
);
if
(
p
==
cm
.
end
())
return
NULL
;
else
return
p
->
second
;
};
// write the symbol for the given symbol code into a string
void
write_char
(
Character
c
,
char
*
buffer
,
int
*
pos
,
bool
with_brackets
=
true
)
const
;
// write the symbol pair of a given label into a string
void
write_label
(
Label
l
,
char
*
buffer
,
int
*
pos
,
bool
with_brackets
=
true
)
const
;
// write the symbol for the given symbol code into a buffer and return
// a pointer to it
// the flag "with_brackets" indicates whether the angle brackets
// surrounding multi-character symbols are to be printed or not
const
char
*
write_char
(
Character
c
,
bool
with_brackets
=
true
)
const
;
// write the symbol pair of a given label into a string
// and return a pointer to it
const
char
*
write_label
(
Label
l
,
bool
with_brackets
=
true
)
const
;
// scan the next multi-character symbol in the argument string
int
next_mcsym
(
char
*&
,
bool
insert
=
true
);
// scan the next symbol in the argument string
int
next_code
(
char
*&
,
bool
extended
=
true
,
bool
insert
=
true
);
// convert a character string into a symbol or label sequence
void
string2symseq
(
char
*
,
std
::
vector
<
Character
>&
);
void
string2labelseq
(
char
*
,
std
::
vector
<
Label
>&
);
// scan the next label in the argument string
Label
next_label
(
char
*&
,
bool
extended
=
true
);
// store the alphabet in the argument file (in binary form)
void
store
(
FILE
*
)
const
;
// read the alphabet from the argument file
void
read
(
FILE
*
);
// disambiguation and printing of analyses
int
compute_score
(
Analysis
&
ana
);
void
disambiguate
(
std
::
vector
<
Analysis
>
&
analyses
);
char
*
print_analysis
(
Analysis
&
ana
,
bool
both_layers
);
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
,
const
Alphabet
&
);
};
// write the alphabet to the output stream (in readable form)
// (declared as a friend of class Alphabet)
std
::
ostream
&
operator
<<
(
std
::
ostream
&
,
const
Alphabet
&
);
}
#endif
src/main/resources/SMOR/src/alphabet.o
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/basic.C
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE basic.C */
/* MODULE basic */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE */
/* */
/*******************************************************************/
#include <stdlib.h>
#include <string.h>
#include "basic.h"
namespace
SFST
{
// If true, read_num() reverses the byte order of each value after reading it.
bool
Switch_Bytes
=
false
;
/*******************************************************************/
/* */
/* fst_strdup */
/* */
/*******************************************************************/
// Returns a malloc()ed copy of the NUL-terminated string pString.
// If the allocation fails, an error message is printed to stderr and the
// program is terminated with exit code 1, so the result is never NULL.
char* fst_strdup(const char* pString)
{
  const size_t bytes = strlen(pString) + 1; // including the terminator
  char* pStringCopy = (char*)malloc(bytes);

  if (!pStringCopy) {
    fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
    exit(1);
  }

  // strcpy returns its destination argument
  return strcpy(pStringCopy, pString);
}
/*******************************************************************/
/* */
/* read_string */
/* */
/*******************************************************************/
// Reads a NUL-terminated string from file into buffer.
//
//   buffer  destination array with room for at least "size" characters
//   size    capacity of buffer (including the terminating NUL)
//   file    input stream
//
// Returns 1 if a terminating NUL byte was read from the file.  Returns 0 if
// the end of the file was reached first, or if "size" characters were read
// without finding the terminator; in the latter case the last character read
// is overwritten with NUL, so the buffer is always terminated.
// For size <= 0 nothing is read or written and 0 is returned.
int read_string(char *buffer, int size, FILE *file)
{
  // without this guard buffer[size-1] below would be written before the buffer
  if (size <= 0)
    return 0;

  for (int i = 0; i < size; i++) {
    int c = fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c == 0);
    }
    buffer[i] = (char)c;
  }

  // buffer exhausted before the terminator was seen: truncate
  buffer[size - 1] = 0;
  return 0;
}
/*******************************************************************/
/* */
/* read_num */
/* */
/*******************************************************************/
// Reads n bytes from file into the object at p.  If Switch_Bytes is set, the
// order of the n bytes is reversed afterwards.
// Returns the number of bytes actually read, as reported by fread().
size_t read_num(void *p, size_t n, FILE *file)
{
  char *bytes = (char*)p;
  const size_t got = fread(bytes, 1, n, file);

  if (Switch_Bytes && n > 1) {
    // reverse the buffer in place, working inwards from both ends
    char *front = bytes;
    char *back = bytes + n - 1;
    while (front < back) {
      const char tmp = *front;
      *front++ = *back;
      *back-- = tmp;
    }
  }

  return got;
}
}
src/main/resources/SMOR/src/basic.h
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE basic.h */
/* MODULE basic */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE */
/* */
/*******************************************************************/
#ifndef _BASIC_H_
#define _BASIC_H_
#include <stdio.h>
namespace
SFST
{
// if true, read_num() reverses the byte order of the values it reads
// (defined in basic.C, initially false)
extern
bool
Switch_Bytes
;
// returns a malloc()ed copy of the argument string;
// terminates the program if no memory is available
char
*
fst_strdup
(
const
char
*
pString
);
// reads a NUL-terminated string of at most size-1 characters into buffer;
// returns 1 if the terminating NUL was read and 0 otherwise
int
read_string
(
char
*
buffer
,
int
size
,
FILE
*
file
);
// reads "size" bytes into *p (byte-swapped if Switch_Bytes is set)
// and returns the number of bytes read
size_t
read_num
(
void
*
p
,
size_t
size
,
FILE
*
file
);
}
#endif
src/main/resources/SMOR/src/basic.o
0 → 100755
View file @
0b611765
File added
src/main/resources/SMOR/src/compact.C
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE compact.C */
/* MODULE compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE Code needed for analysing data */
/* */
/*******************************************************************/
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include "compact.h"
namespace
SFST
{
using
std
::
equal_range
;
using
std
::
vector
;
using
std
::
pair
;
class
label_less
{
public:
bool
operator
()(
const
Label
l1
,
const
Label
l2
)
const
{
return
l1
.
upper_char
()
<
l2
.
upper_char
();
}
};
// NOTE(review): this constant is not referenced by any function defined in
// this file -- presumably a leftover; confirm before removing it.
const
int
BUFFER_SIZE
=
1000
;
/*******************************************************************/
/* */
/* CompactTransducer::convert */
/* */
/*******************************************************************/
// Converts an analysis given as a sequence of arc numbers (cana) into the
// corresponding sequence of arc labels (ana).  Previous contents of "ana"
// are overwritten.
void CompactTransducer::convert(CAnalysis &cana, Analysis &ana)
{
  ana.resize(cana.size());

  for (size_t pos = 0; pos < cana.size(); pos++) {
    const unsigned int arc = cana[pos];
    ana[pos] = label[arc];
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::analyze */
/* */
/*******************************************************************/
void
CompactTransducer
::
analyze
(
unsigned
int
n
,
vector
<
Character
>
&
input
,
size_t
ipos
,
CAnalysis
&
ca
,
vector
<
CAnalysis
>
&
analyses
)
{
// "n" is the number of the current transducer node/state
// "input" is the sequence of input symbols
// "ipos" is the input position currently analysed
// "ca" stores the incomplete analysis string
// "analyses" stores the analyses found so far
if
(
analyses
.
size
()
>
10000
)
return
;
// limit the maximal number of analyses
// Is the input string fully analyzed and the current node a final node?
if
(
finalp
[
n
]
&&
ipos
==
input
.
size
())
// store the new analysis
analyses
.
push_back
(
ca
);
// follow the epsilon transitions
// first_arc[n] is the number of the first outgoing transition of node n
// first_arc[n+1]-1 is the number of the last outgoing transition of node n
// first_arc[n+1] is the number of the first outgoing transition of node n+1
unsigned
int
i
;
for
(
i
=
first_arc
[
n
];
i
<
first_arc
[
n
+
1
]
&&
label
[
i
].
upper_char
()
==
Label
::
epsilon
;
i
++
)
{
ca
.
push_back
(
i
);
analyze
(
target_node
[
i
],
input
,
ipos
,
ca
,
analyses
);
ca
.
pop_back
();
}
// follow the non-epsilon transitions
// scan the next input symbol
if
(
ipos
<
input
.
size
())
{
// find the set of arcs with matching upper character in the sorted list
pair
<
Label
*
,
Label
*>
range
=
equal_range
(
label
+
i
,
label
+
first_arc
[
n
+
1
],
Label
(
input
[
ipos
]),
label_less
());
unsigned
int
to
=
(
unsigned
int
)(
range
.
second
-
label
);
// follow the non-epsilon transitions
for
(
i
=
(
unsigned
)(
range
.
first
-
label
);
i
<
to
;
i
++
)
{
ca
.
push_back
(
i
);
analyze
(
target_node
[
i
],
input
,
ipos
+
1
,
ca
,
analyses
);
ca
.
pop_back
();
}
}
}
/*******************************************************************/
/* */
/* CompactTransducer::analyze_string */
/* */
/*******************************************************************/
void
CompactTransducer
::
analyze_string
(
char
*
s
,
vector
<
CAnalysis
>
&
analyses
)
{
// "s" input string to be analyzed
// "analyses" is the data structure in which the results are stored
// and returned
vector
<
Character
>
input
;
alphabet
.
string2symseq
(
s
,
input
);
analyses
.
clear
();
CAnalysis
ca
;
// data structure where the current incomplete analysis
// is stored
analyze
(
0
,
input
,
0
,
ca
,
analyses
);
// start the analysis
if
(
analyses
.
size
()
>
10000
)
fprintf
(
stderr
,
"Warning: Only the first 10000 analyses considered for
\"
%s
\"
!
\n
"
,
s
);
if
(
simplest_only
&&
analyses
.
size
()
>
1
)
disambiguate
(
analyses
);
// select the simplest analyses
}
/*******************************************************************/
/* */
/* CompactTransducer::~CompactTransducer */
/* */
/*******************************************************************/
// Releases all arrays owned by the transducer.
// (delete[] on a NULL pointer is a no-op, so unset arrays are harmless.)
CompactTransducer::~CompactTransducer()
{
  // node data
  delete[] finalp;
  delete[] first_arc;

  // arc data
  delete[] label;
  delete[] target_node;

  // stochastic parameters
  delete[] final_logprob;
  delete[] arc_logprob;
}
/*******************************************************************/
/* */
/* CompactTransducer::CompactTransducer */
/* */
/*******************************************************************/
// Creates an empty transducer: no nodes, no arcs, no probabilities,
// and both output options switched off.
CompactTransducer::CompactTransducer()
  : number_of_nodes(0),
    finalp(NULL),
    first_arc(NULL),
    number_of_arcs(0),
    label(NULL),
    target_node(NULL),
    final_logprob((float*)NULL),
    arc_logprob((float*)NULL),
    both_layers(false),
    simplest_only(false)
{
}
/*******************************************************************/
/* */
/* CompactTransducer::read_finalp */
/* */
/*******************************************************************/
// Reads the final-state flags of all nodes from the file.
// The flags are stored as a bit vector, eight nodes per byte, with the most
// significant bit of each byte belonging to the node with the lowest number.
void CompactTransducer::read_finalp(FILE *file)
{
  unsigned char byte = 0; // the byte currently being unpacked
  int bits_left = 0;      // number of bits of "byte" not yet consumed

  for (size_t node = 0; node < number_of_nodes; node++) {
    // fetch the next byte when the current one is used up
    if (bits_left == 0) {
      byte = (unsigned char)fgetc(file);
      bits_left = 8;
    }
    bits_left--;

    finalp[node] = (byte >> bits_left) & 1;
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::read_first_arcs */
/* */
/*******************************************************************/
// Reads the table first_arc[0..number_of_nodes] from the file.
// The entries are stored as a packed bit stream of fixed-width fields
// (ceil(log2(number_of_arcs+1)) bits each) which is read in units of
// sizeof(unsigned int) bytes via read_num().
void CompactTransducer::read_first_arcs(FILE *file)
{
  int avail = 0;           // number of unread bits left in "window"
  unsigned int window = 0; // unread bits, aligned to the most significant end
  int bits = (int)ceil(log(number_of_arcs + 1) / log(2));

  for (size_t node = 0; node <= number_of_nodes; node++) {
    // take the next field (or its available prefix) from the top of the window
    unsigned int value = window >> (sizeof(window) * 8 - bits);
    window <<= bits;
    avail -= bits;

    if (avail < 0) {
      // -avail bits of the field are still missing:
      // fetch the next word and take them from its top
      read_num(&window, sizeof(window), file);
      value |= window >> (sizeof(window) * 8 + avail);
      window <<= -avail;
      avail += (int)sizeof(window) * 8;
    }

    first_arc[node] = value;
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::read_target_nodes */
/* */
/*******************************************************************/
// Reads the table target_node[0..number_of_arcs-1] from the file.
// The entries are stored as a packed bit stream of fixed-width fields
// (ceil(log2(number_of_nodes)) bits each) which is read in units of
// sizeof(unsigned int) bytes via read_num().
void CompactTransducer::read_target_nodes(FILE *file)
{
  int avail = 0;           // number of unread bits left in "window"
  unsigned int window = 0; // unread bits, aligned to the most significant end
  int bits = (int)ceil(log(number_of_nodes) / log(2));

  for (size_t arc = 0; arc < number_of_arcs; arc++) {
    // take the next field (or its available prefix) from the top of the window
    unsigned int value = window >> (sizeof(window) * 8 - bits);
    window <<= bits;
    avail -= bits;

    if (avail < 0) {
      // -avail bits of the field are still missing:
      // fetch the next word and take them from its top
      read_num(&window, sizeof(window), file);
      value |= window >> (sizeof(window) * 8 + avail);
      window <<= -avail;
      avail += (int)sizeof(window) * 8;
    }

    target_node[arc] = value;
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::read_labels */
/* */
/*******************************************************************/
void
CompactTransducer
::
read_labels
(
FILE
*
file
)
{
size_t
N
=
0
;
Label
Num2Label
[
alphabet
.
size
()];
for
(
Alphabet
::
const_iterator
it
=
alphabet
.
begin
();
it
!=
alphabet
.
end
();
it
++
)
{
Label
l
=*
it
;
Num2Label
[
N
++
]
=
l
;
}
int
k
=
0
;
unsigned
int
n
=
0
;
int
bits
=
(
int
)
ceil
(
log
((
double
)
alphabet
.
size
())
/
log
(
2
));
for
(
size_t
i
=
0
;
i
<
number_of_arcs
;
i
++
)
{
unsigned
int
l
=
n
>>
(
sizeof
(
n
)
*
8
-
bits
);
n
<<=
bits
;
k
-=
bits
;
if
(
k
<
0
)
{
read_num
(
&
n
,
sizeof
(
n
),
file
);
l
|=
n
>>
(
sizeof
(
n
)
*
8
+
k
);
n
<<=
-
k
;
k
+=
(
int
)
sizeof
(
n
)
*
8
;
}
label
[
i
]
=
Num2Label
[
l
];
}
}
/*******************************************************************/
/* */
/* CompactTransducer::read_probs */
/* */
/*******************************************************************/
// Reads the stochastic parameters of the transducer from a probability file.
// The file starts with the number of nodes and the number of arcs (both of
// type size_t), which must match this transducer, followed by one float per
// node (final_logprob) and one float per arc (arc_logprob).
// On any mismatch or read error a message is printed and the program exits.
void CompactTransducer::read_probs(FILE *file)
{
  size_t n, m;

  // both counts have to be read successfully before they are compared
  if (fread(&n, sizeof(n), 1, file) != 1 ||
      fread(&m, sizeof(m), 1, file) != 1 ||
      n != node_count() || m != arc_count())
    {
      fprintf(stderr, "Error: incompatible probability file!\n");
      exit(1);
    }

  final_logprob = new float[n];
  arc_logprob = new float[m];

  // n node values, then m arc values
  // (the arc array used to be filled with n instead of m values)
  if (fread(final_logprob, sizeof(float), n, file) != n ||
      fread(arc_logprob, sizeof(float), m, file) != m)
    {
      fprintf(stderr, "Error: in probability file!\n");
      exit(1);
    }
}
/*******************************************************************/
/* */
/* CompactTransducer::CompactTransducer */
/* */
/*******************************************************************/
// Reads a compact transducer from "file" and, if "pfile" is not NULL, its
// stochastic parameters from "pfile".
// Throws a C string if the file does not start with the format marker 'c'.
// If a read error is flagged after the header was read, the transducer is
// left empty (no nodes, no arcs).
CompactTransducer::CompactTransducer(FILE *file, FILE *pfile)
{
  both_layers = false;
  simplest_only = false;

  // Start from a well-defined empty state so that the destructor never
  // sees uninitialised pointers, whichever path is taken below.
  number_of_nodes = 0;
  number_of_arcs = 0;
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  arc_logprob = final_logprob = (float*)NULL;

  if (fgetc(file) != 'c')
    throw "Error: wrong file format (not a compact transducer)\n";

  alphabet.read(file);

  read_num(&number_of_nodes, sizeof(number_of_nodes), file);
  read_num(&number_of_arcs, sizeof(number_of_arcs), file);

  if (!ferror(file)) {
    // memory allocation
    finalp = new char[number_of_nodes];
    first_arc = new unsigned[number_of_nodes + 1];
    label = new Label[number_of_arcs];
    target_node = new unsigned[number_of_arcs];

    // reading the data
    read_finalp(file);
    read_first_arcs(file);
    read_labels(file);
    read_target_nodes(file);
  }
  else {
    // the counts may be garbage and no arrays exist: keep the object empty
    number_of_nodes = 0;
    number_of_arcs = 0;
  }

  if (pfile != NULL)
    read_probs(pfile);
}
/*******************************************************************/
/* */
/* CompactTransducer::longest_match2 */
/* */
/*******************************************************************/
// Recursive search for the longest prefix of "string" which is accepted by
// the transducer.
//
//   n       transducer state
//   string  rest of the input string
//   l       length of the input consumed by the current analysis
//   ca      current analysis (arc numbers on the current path)
//   bl      length of the longest match found so far
//   ba      analysis of the longest match found so far
void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
                                       CAnalysis &ca, int &bl, CAnalysis &ba)
{
  // a final state reached with a longer match than before: remember it
  if (finalp[n] && l > bl) {
    bl = l;
    ba = ca; // copy the arc vector
  }

  // follow the leading arcs whose upper symbol is epsilon;
  // "arc" afterwards indexes the first remaining arc
  unsigned int arc = first_arc[n];
  while (arc < first_arc[n + 1] &&
         label[arc].upper_char() == Label::epsilon) {
    ca.push_back(arc);
    longest_match2(target_node[arc], string, l, ca, bl, ba);
    ca.pop_back();
    arc++;
  }

  // scan the next input symbol
  char *end = string;
  const int c = alphabet.next_code(end, false, false);
  l += (int)(end - string);
  if (c == EOF)
    return;

  // find the set of arcs with matching upper character in the sorted list
  pair<Label*, Label*> range =
    equal_range(label + arc, label + first_arc[n + 1],
                Label((Character)c), label_less());

  // follow the matching non-epsilon transitions
  const unsigned int to = (unsigned int)(range.second - label);
  for (unsigned int a = (unsigned)(range.first - label); a < to; a++) {
    ca.push_back(a);
    longest_match2(target_node[a], end, l, ca, bl, ba);
    ca.pop_back();
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::print_analysis */
/* */
/*******************************************************************/
// Returns the string representation of an analysis given as a sequence of
// arc numbers.  The string is produced by Alphabet::print_analysis() and
// contains both layers if "both_layers" is set.
char *CompactTransducer::print_analysis(CAnalysis &cana)
{
  // translate the arc numbers into labels first
  Analysis labels;
  convert(cana, labels);

  return alphabet.print_analysis(labels, both_layers);
}
/*******************************************************************/
/* */
/* CompactTransducer::longest_match */
/* */
/*******************************************************************/
const
char
*
CompactTransducer
::
longest_match
(
char
*
&
string
)
{
vector
<
char
>
analysis
;
CAnalysis
ca
,
ba
;
int
l
=
0
;
longest_match2
(
0
,
string
,
0
,
ca
,
l
,
ba
);
// no match? return the next character
if
(
ba
.
size
()
==
0
)
{
int
c
=
alphabet
.
next_code
(
string
,
false
,
false
);
return
alphabet
.
code2symbol
((
Character
)
c
);
}
string
+=
l
;
return
print_analysis
(
ba
);
}
/*******************************************************************/
/* */
/* CompactTransducer::disambiguate */
/* */
/*******************************************************************/
// Keeps only the analyses with the best (highest) score according to
// Alphabet::compute_score().  The relative order of the surviving analyses
// is preserved.
void CompactTransducer::disambiguate(vector<CAnalysis> &analyses)
{
  // score every analysis and remember the maximum
  vector<int> score;
  int bestscore = INT_MIN;
  Analysis labels;
  for (size_t i = 0; i < analyses.size(); i++) {
    convert(analyses[i], labels);
    const int s = alphabet.compute_score(labels);
    score.push_back(s);
    if (s > bestscore)
      bestscore = s;
  }

  // compact the vector in place, dropping the suboptimal analyses
  size_t kept = 0;
  for (size_t i = 0; i < analyses.size(); i++) {
    if (score[i] != bestscore)
      continue;
    analyses[kept] = analyses[i];
    kept++;
  }
  analyses.resize(kept);
}
/*******************************************************************/
/* */
/* CompactTransducer::train2 */
/* */
/*******************************************************************/
// Training on a fully specified label sequence: the string "s" is converted
// to a sequence of labels and the transducer is traversed deterministically,
// taking at each node the first arc whose label equals the next input label.
// On success the frequencies of the traversed arcs and of the final node
// reached are incremented by one and true is returned.  If the input is not
// covered, a warning is printed and false is returned.
bool CompactTransducer::train2(char *s, vector<double> &arcfreq,
                               vector<double> &finalfreq)
{
  vector<Label> input;
  alphabet.string2labelseq(s, input);

  CAnalysis ca; // data structure where the analysis is stored
  unsigned int n = 0; // current node
  bool failure = false;
  for (size_t i = 0; i < input.size(); i++) {
    failure = true;
    for (unsigned int k = first_arc[n]; k < first_arc[n + 1]; k++) {
      if (label[k] == input[i]) {
        ca.push_back(k);
        n = target_node[k];
        failure = false;
        break;
      }
    }
    if (failure)
      break;
  }

  if (failure || !finalp[n]) {
    fprintf(stderr, "Warning: The following input is not covered:\n%s\n", s);
    return false;
  }

  for (size_t k = 0; k < ca.size(); k++)
    arcfreq[ca[k]]++;

  // n is the node reached by the last arc, or the start node if the input
  // was empty (ca.back() must not be called on an empty path)
  finalfreq[n]++;

  return true;
}
/*******************************************************************/
/* */
/* CompactTransducer::train */
/* */
/*******************************************************************/
// Training on a surface string: the string "s" is analysed and the
// frequency mass 1 is distributed evenly over all analyses (or over the
// simplest ones if "simplest_only" is set).  Each analysis adds its share
// to the frequencies of its arcs and of its final node.
// Returns false if the string has no analysis and true otherwise; inputs
// with more than 10000 analyses are ignored (but return true).
bool CompactTransducer::train(char *s, vector<double> &arcfreq,
                              vector<double> &finalfreq)
{
  vector<CAnalysis> analyses;
  vector<Character> input;
  alphabet.string2symseq(s, input);

  CAnalysis ca; // data structure where the current incomplete analysis
                // is stored
  analyze(0, input, 0, ca, analyses); // start the analysis

  if (analyses.size() > 10000)
    return true; // ignore inputs with more than 10000 analyses
  else if (analyses.size() == 0)
    return false;

  if (simplest_only && analyses.size() > 1)
    disambiguate(analyses); // select the simplest analyses

  if (analyses.size() > 0) {
    double incr = 1.0 / (double)analyses.size();
    for (size_t i = 0; i < analyses.size(); i++) {
      CAnalysis &arcs = analyses[i];
      for (size_t k = 0; k < arcs.size(); k++)
        arcfreq[arcs[k]] += incr;

      // An analysis without arcs (empty input accepted in the start node)
      // ends in node 0; back() must not be called on an empty vector.
      const unsigned int final_node =
        arcs.empty() ? 0 : target_node[arcs.back()];
      finalfreq[final_node] += incr;
    }
  }
  return true;
}
/*******************************************************************/
/* */
/* CompactTransducer::estimate_probs */
/* */
/*******************************************************************/
// Turns the frequencies into probabilities: for every node, the final
// frequency and the frequencies of the outgoing arcs are divided by their
// sum.  Nodes whose frequencies sum to zero are left unchanged.
void CompactTransducer::estimate_probs(vector<double> &arcfreq,
                                       vector<double> &finalfreq)
{
  for (size_t node = 0; node < finalfreq.size(); node++) {
    // the arcs of this node
    const size_t from = first_arc[node];
    const size_t to = first_arc[node + 1];

    // total frequency mass of the node
    double total = finalfreq[node];
    for (size_t arc = from; arc < to; arc++)
      total += arcfreq[arc];
    if (total == 0.0)
      total = 1.0; // avoid a division by zero

    // normalization
    finalfreq[node] /= total;
    for (size_t arc = from; arc < to; arc++)
      arcfreq[arc] /= total;
  }
}
/*******************************************************************/
/* */
/* CompactTransducer::compute_probs */
/* */
/*******************************************************************/
// Computes the probabilities of the analyses and sorts the analyses by
// decreasing probability.  On return prob[i] is the probability of
// analyses[i], normalized such that the probabilities sum to 1.
// The probability of an analysis is the product of its arc probabilities
// and the final probability of its last node (computed in log space from
// arc_logprob and final_logprob).
void CompactTransducer::compute_probs(vector<CAnalysis> &analyses,
                                      vector<double> &prob)
{
  prob.resize(analyses.size());
  double sum = 0.0;
  for (size_t i = 0; i < analyses.size(); i++) {
    CAnalysis &a = analyses[i];

    // compute the probability
    double logprob = 0.0;
    for (size_t k = 0; k < a.size(); k++)
      logprob += arc_logprob[a[k]];

    // An analysis without arcs (empty input accepted in the start node)
    // ends in node 0; back() must not be called on an empty vector.
    const unsigned int final_node = a.empty() ? 0 : target_node[a.back()];
    logprob += final_logprob[final_node];

    prob[i] = exp(logprob);
    sum += prob[i];
  }

  // sort the analyses (selection sort on copies of the input vectors)
  vector<CAnalysis> oldanalyses(analyses);
  vector<double> oldprob(prob);
  for (size_t i = 0; i < analyses.size(); i++) {
    // pick the most probable analysis which has not been selected yet
    prob[i] = -1.0;
    size_t n = 0;
    for (size_t k = 0; k < oldanalyses.size(); k++)
      if (prob[i] < oldprob[k]) {
        prob[i] = oldprob[k];
        n = k;
      }
    analyses[i] = oldanalyses[n];
    oldprob[n] = -1.0; // mark as used
    prob[i] /= sum; // normalization
  }
}
}
src/main/resources/SMOR/src/compact.h
0 → 100755
View file @
0b611765
/*******************************************************************/
/* */
/* FILE compact.h */
/* MODULE compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE finite state tools */
/* */
/*******************************************************************/
#ifndef _COMPACT_H_
#define _COMPACT_H_
#include "alphabet.h"
#include <vector>
namespace
SFST
{
// a compact analysis is a sequence of arc numbers
// (CompactTransducer::convert uses them as indices into the label array)
typedef
std
::
vector
<
unsigned
int
>
CAnalysis
;
class
CompactTransducer
{
protected:
// the following data structures are used to store the nodes
unsigned
int
number_of_nodes
;
// number of nodes in the transducer
char
*
finalp
;
// finalp[i] is 1 if node i is final and 0 otherwise
unsigned
int
*
first_arc
;
// first_arc[i] is the number of the first
// arc outgoing from node i
// the following data structures are used to store the transition arcs
unsigned
int
number_of_arcs
;
// total number of arcs in the transducer
Label
*
label
;
// the label (character pair) of arc i
unsigned
int
*
target_node
;
// target node of arc i
// the following data structures are used to store the stochastic parameters
float
*
final_logprob
;
float
*
arc_logprob
;
// functions needed to read the transducer from a file
void
read_finalp
(
FILE
*
file
);
void
read_first_arcs
(
FILE
*
file
);
void
read_target_nodes
(
FILE
*
file
);
void
read_labels
(
FILE
*
file
);
void
read_probs
(
FILE
*
file
);
// functions needed to analyze data with the transducer
void
analyze
(
unsigned
int
n
,
std
::
vector
<
Character
>
&
ch
,
size_t
ipos
,
CAnalysis
&
,
std
::
vector
<
CAnalysis
>&
);
// function selecting the simplest morphological analysis
int
compute_score
(
CAnalysis
&
ana
);
void
disambiguate
(
std
::
vector
<
CAnalysis
>
&
analyses
);
// functions for longest-match analysis of input data
void
longest_match2
(
unsigned
int
,
char
*
,
int
,
CAnalysis
&
,
int
&
,
CAnalysis
&
);
void
convert
(
CAnalysis
&
cana
,
Analysis
&
ana
);
public:
size_t
node_count
()
{
return
number_of_nodes
;
};
size_t
arc_count
()
{
return
number_of_arcs
;
};
bool
both_layers
;
// print surface and analysis symbols
bool
simplest_only
;
// print only the simplest analyses
Alphabet
alphabet
;
// data structure which maps symbols to numeric codes
CompactTransducer
();
// dummy constructor
CompactTransducer
(
FILE
*
,
FILE
*
pfile
=
NULL
);
// reads a (stochastic) transducer
~
CompactTransducer
();
// destroys a transducer
// the analysis function returns the set of analyses for the string "s"
// in the argument "analyses"
void
analyze_string
(
char
*
s
,
std
::
vector
<
CAnalysis
>
&
analyses
);
void
compute_probs
(
std
::
vector
<
CAnalysis
>
&
analyses
,
std
::
vector
<
double
>
&
prob
);
char
*
print_analysis
(
CAnalysis
&
ana
);
// longest-match analysis
const
char
*
longest_match
(
char
*&
);
// EM training
bool
train2
(
char
*
s
,
std
::
vector
<
double
>
&
arcfreq
,
std
::
vector
<
double
>
&
finalfreq
);
bool
train
(
char
*
s
,
std
::
vector
<
double
>
&
arcfreq
,
std
::
vector
<
double
>
&
finalfreq
);
void
estimate_probs
(
std
::
vector
<
double
>
&
arcfreq
,
std
::
vector
<
double
>
&
finalfreq
);
// robust analysis
float
robust_analyze_string
(
char
*
string
,
std
::
vector
<
CAnalysis
>
&
analyses
,
float
ErrorsAllowed
);
};
}
#endif
Prev
1
2
3
4
5
6
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment