1
0
forked from GitHub/gf-core

DictEngFin: a translation dictionary based on Finnish wordnet, Kotus, and frequency list. Not yet quite functional.

This commit is contained in:
aarne
2012-05-23 15:46:12 +00:00
parent 842b321ce9
commit 6e6d2611dc
13 changed files with 213798 additions and 0 deletions

39731
lib/src/finnish/DictEngFin.gf Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -188,6 +188,7 @@ oper
mkV2 : V -> V2 ; -- direct transitive
mkV2 : V -> Case -> V2 ; -- complement just case
mkV2 : V -> Prep -> V2 ; -- complement pre/postposition
mkV2 : VK -> V2 ; -- direct transitive of Kotus verb
} ;
@@ -607,6 +608,7 @@ oper
mkV2 : V -> V2 = dirV2 ;
mkV2 : V -> Case -> V2 = caseV2 ;
mkV2 : V -> Prep -> V2 = mk2V2 ;
mkV2 : VK -> V2 = \w -> dirV2 (vforms2V w.s ** {sc = NPCase Nom ; lock_V = <>}) ;
} ;
mk2V2 : V -> Prep -> V2 ;

View File

@@ -0,0 +1,97 @@
--# -path=alltenses:.:../english
-- ParseFin: Finnish concrete syntax for the abstract syntax ParseEngAbs.
-- The module is put together from the modules listed below. A list in
-- square brackets after a module name restricts what is taken from it
-- (only the named items), and "- [...]" takes everything except the
-- named items. The lexicon comes from DictEngFin.
concrete ParseFin of ParseEngAbs =
TenseX, ---- - [Pol, PNeg, PPos],
CatFin,
NounFin,
AdjectiveFin,
NumeralFin,
SymbolFin [PN, Symb, MkSymb, SymbPN],
ConjunctionFin,
VerbFin - [SlashV2V, PassV2, UseCopula],
AdverbFin,
PhraseFin,
SentenceFin,
RelativeFin,
IdiomFin [NP, VP, Tense, Cl, ProgrVP, ExistNP],
-- ExtraFin [NP, Quant, VPSlash, VP, Tense, GenNP, PassVPSlash],
DictEngFin **
open MorphoFin, ResFin, ParadigmsFin, Prelude in {
flags literal=Symb ;
-- NOTE(review): everything between here and the end of the module body is
-- inside one block comment, so this module currently defines no
-- linearization rules of its own. The disabled rules contain English
-- strings ("myself", "that", "not"), so they look like they were carried
-- over from an English grammar and still need Finnish versions; confirm
-- before enabling any of them.
{-
lin
myself_NP = regNP "myself" singular ;
yourselfSg_NP = regNP "yourself" singular ;
himself_NP = regNP "himself" singular ;
herself_NP = regNP "herself" singular ;
itself_NP = regNP "itself" singular ;
ourself_NP = regNP "ourself" plural ;
yourselfPl_NP = regNP "yourself" plural ;
themself_NP = regNP "themself" plural ;
themselves_NP = regNP "themselves" plural ;
CompoundCN num noun cn = {
s = \\n,c => num.s ! Nom ++ noun.s ! num.n ! Nom ++ cn.s ! n ! c ;
g = cn.g
} ;
DashCN noun1 noun2 = {
s = \\n,c => noun1.s ! Sg ! Nom ++ "-" ++ noun2.s ! n ! c ;
g = noun2.g
} ;
GerundN v = {
s = \\n,c => v.s ! VPresPart ;
g = Neutr
} ;
GerundAP v = {
s = \\agr => v.s ! VPresPart ;
isPre = True
} ;
PastPartAP v = {
s = \\agr => v.s ! VPPart ;
isPre = True
} ;
OrdCompar a = {s = \\c => a.s ! AAdj Compar c } ;
PositAdVAdj a = {s = a.s ! AAdv} ;
UseQuantPN q pn = {s = \\c => q.s ! False ! Sg ++ pn.s ! npcase2case c ; a = agrgP3 Sg pn.g} ;
SlashV2V v p vp = insertObjc (\\a => p.s ++ case p.p of {CPos => ""; _ => "not"} ++
v.c3 ++
infVP v.typ vp a)
(predVc v) ;
ComplPredVP np vp = {
s = \\t,a,b,o =>
let
verb = vp.s ! t ! a ! b ! o ! np.a ;
compl = vp.s2 ! np.a
in
case o of {
ODir => compl ++ "," ++ np.s ! npNom ++ verb.aux ++ verb.adv ++ vp.ad ++ verb.fin ++ verb.inf ;
OQuest => verb.aux ++ compl ++ "," ++ np.s ! npNom ++ verb.adv ++ vp.ad ++ verb.fin ++ verb.inf
}
} ;
that_RP = {
s = \\_ => "that" ;
a = RNoAg
} ;
no_RP = {
s = \\_ => "" ;
a = RNoAg
} ;
CompS s = {s = \\_ => "that" ++ s.s} ;
CompVP vp = {s = \\a => infVP VVInf vp a} ;
lin
PPos = {s = [] ; p = CPos} ;
PNeg = {s = [] ; p = CNeg True} ; -- contracted: don't
-}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,95 @@
import Data.Map
import Data.List
import Data.Char
-- a script for extracting an English-Finnish translation dictionary from
-- (1) Eng-Fin wordnet links
-- (2) Fin frequency dictionary
-- (3) Fin KOTUS morpho wordlist
-- usage: runghc FreqFin.hs >DictEngFin.gf (with appropriate files in place)
-- AR 23/5/2012
-- | Read the frequency list, the morphological lexicon and the five
-- per-category translation files (all from the current directory), and
-- print the generated @lin@ rules in sorted order on standard output.
main :: IO ()
main = do
  freqs <- load "taajuus.txt" getFreqMap
  morpho <- load "DictFin.gf" getMorphoMap
  -- every translation file is processed against the same two lookup tables
  let dictFor cat file = load file (getTransDict cat freqs morpho)
  transV <- dictFor "V" "Ven_fi.txt"
  transV2 <- dictFor "V2" "V2en_fi.txt"
  transA <- dictFor "A" "Aen_fi.txt"
  transN <- dictFor "N" "Nen_fi.txt"
  transAdv <- dictFor "Adv" "Adven_fi.txt"
  let entries = concat [transV, transV2, transA, transN, transAdv]
  mapM_ putStrLn (sort (lmap mkLin entries))
  where
    -- read a file and run a pure parser over its contents
    load path parse = fmap parse (readFile path)
-- | Build the frequency map from the contents of the frequency list file.
--
-- Every line is split into whitespace-separated fields and passed to
-- 'getFreq', which needs at least five fields and reads the first one as
-- a number. Lines that do not have that shape (blank lines, headers,
-- truncated rows) are skipped, so a single stray line no longer aborts the
-- whole run with a pattern-match or 'read' failure.
--
-- NOTE(review): the map is built with 'fromList', so when the same word
-- occurs on several lines only the last of them is kept.
getFreqMap :: String -> FreqMap
getFreqMap contents =
  fromList [getFreq fields | fields <- lmap words (lines contents), wellFormed fields]
  where
    -- at least five fields, the first consisting of decimal digits only
    -- ('words' never yields an empty field, so the number is non-empty)
    wellFormed (n : _ : _ : _ : _ : _) = all isDigit n
    wellFormed _ = False
-- Short names for the list and map functions whose plain names are
-- exported both by the Prelude and by the unqualified Data.Map import.

-- | 'Prelude.map' on lists.
lmap :: (a -> b) -> [a] -> [b]
lmap f xs = Prelude.map f xs

-- | 'Data.Map.lookup'.
mlookup :: Ord k => k -> Map k v -> Maybe v
mlookup key table = Data.Map.lookup key table

-- | 'Prelude.null' on lists.
lnull :: [a] -> Bool
lnull xs = Prelude.null xs
-- | Frequency list as a lookup table: word -> (rank, category), with the
-- entries produced by 'getFreq'.
type FreqMap = Map Word (Rank,Cat)
-- | Rank of a translation candidate; 'sortTrans' orders candidates by
-- ascending rank.
type Rank = Int
-- | Category tag as a string; 'main' passes "V", "V2", "A", "N" and "Adv".
type Cat = String
-- | A word used as dictionary key or as translation.
-- NOTE(review): newer Preludes also export a type named Word; when the
-- script is built against one, this alias presumably needs
-- @import Prelude hiding (Word)@ to stay unambiguous. To be confirmed.
type Word = String
-- | Text of the right-hand side of a generated @lin@ rule.
type Lin = String
-- | Turn the fields of one frequency-list line into a map entry.
--
-- The first field is read as the rank, the fourth is the word and the
-- fifth its category; the second and third fields and anything after the
-- fifth are ignored. A line with fewer than five fields is a pattern-match
-- failure, and a non-numeric first field fails when the rank is evaluated.
getFreq :: [String] -> (Word,(Rank,Cat))
getFreq (rank : _ : _ : word : cat : _) = (word, (read rank, cat))
-- | Morphological lexicon as a lookup table:
-- word -> (category, linearization text).
type MorphoMap = Map Word (Cat,Lin)

-- | Build the lexicon table from the text of the lexicon source file by
-- collecting whatever 'getMorpho' extracts from each line.
getMorphoMap :: String -> MorphoMap
getMorphoMap contents = fromList (concatMap (getMorpho . words) (lines contents))
-- | Extract a lexicon entry from the whitespace-separated tokens of one
-- line.
--
-- A line of the shape @lin IDENT x RHS... END@ yields one entry
-- @(word, (category, rhs))@, where
--
-- * IDENT is split at its first underscore: the part before it is the
--   word, and the part after it, minus its last character, is the
--   category (so @aakkonen_NK@ gives word @aakkonen@, category @N@);
-- * the third token (@x@) is ignored;
-- * the right-hand side is the remaining tokens without the final one.
--
-- Every other line yields no entry. This includes @lin@ lines whose
-- identifier has no underscore or nothing after it, and lines with nothing
-- after the third token: previously those produced entries whose fields
-- failed ('tail' or 'init' of an empty list) once they were inspected.
--
-- The result type is (word, (category, linearization)), all plain strings.
getMorpho :: [String] -> [(String, (String, String))]
getMorpho ("lin" : ident : _ : rhs@(_ : _))
  | (word, '_' : tag@(_ : _)) <- break (== '_') ident =
      [(word, (init tag, unwords (init rhs)))]
getMorpho _ = []
-- | Translation dictionary: for each source word its category and the
-- selected translation candidates, each with rank and linearization text.
type TransDict = [(Word,(Cat,[(Word,(Rank,Lin))]))]

-- | Build the translation dictionary for one category from the text of a
-- translation file.
--
-- The text is cut into stanzas (see 'stanzas'). The first token of a
-- stanza's first line is the source word; on every line, the tokens after
-- the first form one translation candidate. Candidates are ranked as
-- follows and then passed to 'sortTrans':
--
-- * a single word found in both the frequency and the lexicon table, with
--   compatible categories in both: frequency rank, lexicon linearization;
-- * a single word with a compatible frequency entry: frequency rank, the
--   word itself as a quoted string;
-- * a single word with a compatible lexicon entry: 'morphoRank', lexicon
--   linearization;
-- * any other single word: 'guessRank' and a quoted string if it starts
--   with a letter, otherwise 'noRank' and the bare word;
-- * anything that is not exactly one word: 'compRank' and the bare text.
--
-- Every linearization is prefixed with @mk@ followed by the category.
getTransDict :: Cat -> FreqMap -> MorphoMap -> String -> TransDict
getTransDict cat freqs morpho contents =
  [entry (lmap words stanza) | stanza <- stanzas (lines contents)]
  where
    entry rows@((headword : _) : _) =
      (headword, (cat, sortTrans cat [ranked target | _ : target <- rows]))

    ranked [fin] = rankSingle fin (mlookup fin freqs) (mlookup fin morpho)
    ranked fins = (phrase, (compRank, apply phrase))
      where
        phrase = unwords fins

    -- a clause whose guard fails falls through to the next clause
    rankSingle fin (Just (rank, fcat)) (Just (mcat, mlin))
      | compatCat cat fcat && compatCat cat mcat = (fin, (rank, apply mlin))
    rankSingle fin (Just (rank, fcat)) _
      | compatCat cat fcat = (fin, (rank, apply (quote fin)))
    rankSingle fin _ (Just (mcat, mlin))
      | compatCat cat mcat = (fin, (morphoRank, apply mlin))
    rankSingle fin _ _
      | startsWithLetter fin = (fin, (guessRank, apply (quote fin)))
      | otherwise = (fin, (noRank, apply fin))

    startsWithLetter (c : _) = isLetter c
    startsWithLetter [] = True

    quote s = "\"" ++ s ++ "\""
    apply arg = "mk" ++ cat ++ " " ++ arg
-- | Order the translation candidates by ascending rank and keep only the
-- first one (none if there are no candidates). The category argument is
-- not used.
sortTrans :: Cat -> [(Word,(Rank,Lin))] -> [(Word,(Rank,Lin))]
sortTrans _ candidates = case sortBy byRank candidates of
  best : _ -> [best] ---- only the top candidate is kept for now
  [] -> []
  where
    byRank (_, (r1, _)) (_, (r2, _)) = compare r1 r2
-- | Does a category found in one of the lookup tables (second argument)
-- fit the category being generated (first argument)? For "V2" the table
-- category must be "V"; for anything else the two must be equal.
compatCat :: String -> String -> Bool
compatCat "V2" found = found == "V"
compatCat wanted found = found == wanted
-- Fixed ranks that 'getTransDict' gives to candidates it cannot rank from
-- the frequency list. 'mkLin' only emits an active rule for a rank below
-- 'noRank'.

-- | Single word known only from the lexicon table.
morphoRank :: Int
morphoRank = 10000

-- | Single unknown word that starts with a letter.
guessRank :: Int
guessRank = 20000

-- | Single unknown word that does not start with a letter.
noRank :: Int
noRank = 30000

-- | Candidate that is not exactly one word.
compRank :: Int
compRank = 40000
-- | Render one dictionary entry as a line of GF source.
--
-- The function name is the source word with every @-@ replaced by @_@,
-- followed by @_@ and the category. What is emitted depends on the first
-- translation candidate:
--
-- * rank below 'noRank': an active rule @lin FUN = LIN ;@;
-- * any other rank: a commented-out rule @-- lin FUN = "WORD" ;@ showing
--   the candidate as a quoted string;
-- * no candidate at all: a commented-out rule with GF's empty
--   @variants {}@ as placeholder. This case used to be a pattern-match
--   failure.
mkLin :: (Word,(Cat,[(Word,(Rank,Lin))])) -> String
mkLin (word,(cat,ws)) = unwords [keyw, fun, "=", rhs, ";"]
  where
    fun = lmap clean word ++ "_" ++ cat
    (keyw, rhs) = case ws of
      (_, (r, l)) : _ | r < noRank -> ("lin", l)
      (w, _) : _ -> ("-- lin", "\"" ++ w ++ "\"") ---- look inside non-freq words
      [] -> ("-- lin", "variants {}")
    clean '-' = '_'
    clean c = c
-- | Split a list of lines into stanzas: maximal runs of consecutive
-- non-blank lines. Blank lines only separate stanzas and never appear in
-- the result, so no stanza is empty.
--
-- A line counts as blank when it is empty or consists of white space only.
-- Separator lines that carry stray spaces, tabs or a carriage return (as in
-- files with CRLF line ends) therefore end a stanza instead of being glued
-- into it.
stanzas :: [String] -> [[String]]
stanzas ls = case dropWhile blank ls of
  [] -> []
  rest -> let (stanza, more) = break blank rest in stanza : stanzas more
  where
    blank line = all isSpace line

View File

@@ -0,0 +1,190 @@
FinnWordNetin sanastoon perustuvat HFST-transduktorit
=====================================================
Tämä paketti sisältää FinnWordNetin sanastodatan pohjalta luotuja
HFST-transduktoreita, joita voi käyttää (taivuttavina) suomen tai
englannin synonyymisanastoina tai käännössanakirjoina.
FinnWordNet
-----------
FinnWordNet on suomen wordnet. Se on luotu käännättämällä
ammattikääntäjillä alkuperäisen englanninkielen Princeton WordNetin
(PWN) version 3.0 sanat (sananmerkitykset) suomeksi ja yhdistämällä
käännökset PWN:n rakenteeseen. FinnWordNet on osa
FIN-CLARIN-infrastruktuurihanketta:
http://www.ling.helsinki.fi/finclarin/
Lisätietoja FinnWordNetistä saa FinnWordNet-projektin WWW-sivulta:
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/finnwordnet/
HFST – Helsinki Finite-State Transducer Technology
--------------------------------------------------
Lisätietoa HFST:stä (englanniksi) saa projektin WWW-sivulta:
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/hfst/
FinnWordNetin transduktorit ovat HFST:n optimized lookup -muodossa:
https://kitwiki.csc.fi/twiki/bin/view/KitWiki/HfstOptimizedLookupFormat
Transduktoritiedostoissa on pääte .hfstol. Niiden käyttäminen
edellyttää joko HFST-kirjastoa ja -työkaluja (versiota 3.2.0 tai
uudempaa) tai erillistä HFST optimized lookup -ohjelmaa, jolla niitä
voi ajaa:
http://sourceforge.net/projects/hfst/files/optimized-lookup/
Transduktorit tarvitsevat optimized lookup -ohjelman version 1.3 (tai
uudemman) tai Java-toteutuksen (hfst-ol.jar, 2011-05-23 tai uudempi);
ne eivät toimi Python-toteutuksella (2011-05-24).
FinnWordNetin transduktoripaketit
---------------------------------
FinnWordNetin transduktorit on jaettu kolmeen pakettiin, joista
jokaisessa on hiukan erilaiset transduktorit (YYYYMMDD viittaa paketin
julkaisupäivään):
fiwnsyn-fi-YYYYMMDD.zip - Suomen synonyymisanastot
fiwnsyn-en-YYYYMMDD.zip - Englannin synonyymisanastot (perustuvat
Princeton WordNetiin)
fiwntransl-YYYYMMDD.zip - Suomi–englanti ja englanti–suomi
käännössanakirjat
Tämä LUEMINUT-tiedosto on yhteinen kaikille paketeille.
Synonyymisanastotransduktorien nimet ovat muotoa
fiwnsyn-KL-TYYPPI.hfstol, missä KL on kielikoodi ”fi” tai ”en” ja
TYYPPI on yksi seuraavista:
infl - Transduktori tunnistaa syötesanan taivutusmuodon ja tuottaa
synonyymit samassa taivutusmuodossa. Transduktori ei tunnista
eikä tuota monisanaisia synonyymeja. Sanaa ei lasketa itsensä
synonyymiksi.
infl-refl - Sama kuin edellä, mutta synonymia on refleksiivistä:
sana lasketaan itsensä synonyymiksi. Tämä mahdollistaa
syötesanan mahdollisten vaihtoehtoisten taivutusmuotojen
tuottamisen, kuten ”omenoiden”, ”omenoitten”, ”omenien”,
”omenojen”, ”omenain”.
noinfl - Transduktori tunnistaa syötesanan taivutetuissa
muodoissa, mutta tuottaa synonyymit perusmuodoissaan.
Englannin transduktorit tunnistavat ja tuottavat myös
monisanaiset ilmaukset ja suomen transduktorit tuottavat.
Sanaa ei lasketa itsensä synonyymiksi.
noinfl-refl - Sama kuin edellä, mutta synonymia on refleksiivistä.
Käännössanakirjatransduktoritiedostojen nimet ovat
fiwntransl-fien.hfstol (suomi–englanti) ja fiwntransl-enfi.hfstol
(englanti–suomi). Ne tunnistavat syötesanan taivutettuja muotoja,
mutta tuottavat käännökset perusmuodossa. Englanti–suomi-sanakirja
sekä tunnistaa että tuottaa monisanaisia ilmauksia, kun taas
suomi–englanti-sanakirja vain tuottaa niitä.
Lähteitä
--------
FinnWordNetin ja Princeton WordNetin datan lisäksi transduktorien
tekemisessä on käytetty Omorfia, suomen avointa morfologista työkalua
(http://gna.org/projects/omorfi), ja HFST:n englannin morfologiaa
(http://sourceforge.net/projects/hfst/files/morphological-transducers/hfst-english.tar.gz/download),
joka on alunperin Måns Huldenin Princeton WordNetin datan pohjalta
tekemä.
Puutteita
---------
* Monisanaisten ilmausten käsittely on osittain epäyhtenäistä.
* Suomen synonyymisanastot, erityisesti taivuttavat sanastot,
tuottavat usein monia identtisiä tulostesanoja.
* Englannin taivuttava synonyymisanasto yligeneroi joitain
sananmuotoja, kuten virheellisen kaksinkertaisen monikon genetiivin
(”nets’s”) oikean (”nets’”) lisäksi.
* Ei-taivuttava englannin synonyymisanasto ja englanti–suomi-sanakirja
tunnistavat taivutuksen monisanaisen ilmauksen viimeisessä sanassa,
vaikka olisi oikein taivuttaa jotain aiempaa sanaa. Ne tunnistavat
esimerkiksi ”arrive ated” oikean muodon ”arrived at” sijaan.
* Monitulkintaisen tai monimerkityksisen sananmuodon kaikki synonyymit
tai käännökset luetellaan yhdessä, ilman järjestystä tai ryhmittelyä
sanaluokan tai sananmerkityksen mukaan.
Lisenssi
--------
Koska FinnWordNet käyttää Princeton WordNetin rakennetta ja
merkitysten selitteitä, se on PWN:n johdannainen ja siten PWN:n
lisenssin alainen:
http://wordnet.princeton.edu/wordnet/license/
PWN:n lisenssi sallii vapaan käytön, myös kaupallisesti, kunhan sen
käyttämisestä ja tekijänoikeuksista kerrotaan:
WordNet Release 3.0 This software and database is being provided
to you, the LICENSEE, by Princeton University under the following
license. By obtaining, using and/or copying this software and
database, you agree that you have read, understood, and will
comply with these terms and conditions.: Permission to use, copy,
modify and distribute this software and database and its
documentation for any purpose and without fee or royalty is hereby
granted, provided that you agree to comply with the following
copyright notice and statements, including the disclaimer, and
that the same appear on ALL copies of the software, database and
documentation, including modifications that you make for internal
use or for distribution. WordNet 3.0 Copyright 2006 by Princeton
University. All rights reserved. THIS SOFTWARE AND DATABASE IS
PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS
OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR
WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR
PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR
DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS,
COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
University or Princeton may not be used in advertising or
publicity pertaining to distribution of the software and/or
database. Title to copyright in this software, database and any
associated documentation shall at all times remain with Princeton
University and LICENSEE agrees to preserve same.
FinnWordNetin sisältämien sanojen suomenkielisten käännösten
tekijänoikeudet ovat Helsingin yliopistolla. Ne lisensoidaan Creative
Commons Nimeä (CC BY) 3.0 -lisenssillä, joka on samantapainen kuin
PWN:n lisenssi:
http://creativecommons.org/licenses/by/3.0/deed.fi
Kun viittaat FinnWordNetiin, viittaa seuraavaan artikkeliin:
Krister Lindén and Lauri Carlson. 2010. FinnWordNet – WordNet på
finska via översättning. LexicoNordica – Nordic Journal of
Lexicography, 17:119–140.
Yhteystiedot
------------
FinnWordNet-projektia johtaa tutkimusjohtaja, FT Krister Lindén
Helsingin yliopiston nykykielten laitoksessa (kieliteknologian
oppiaineessa). Teknisissä kysymyksissä yhteyshenkilönä on
projektitutkija Jyrki Niemi. Sähköpostiosoitteet ovat muotoa
etunimi.sukunimi@helsinki.fi (aksentit poistettuina).

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,195 @@
HFST transducers based on FinnWordNet dictionary data
=====================================================
This package contains various HFST transducers based on FinnWordNet
lexical data. The transducers can be used as (inflecting) Finnish or
English thesauri or translation dictionaries.
FinnWordNet
-----------
FinnWordNet is a wordnet for Finnish. It was created by having
professional translators translate the word senses of the Princeton
WordNet (PWN) 3.0 into Finnish and by combining the translations with
the PWN structure. FinnWordNet is a part of the FIN-CLARIN project:
http://www.ling.helsinki.fi/finclarin/
For more information about FinnWordNet, please visit the FinnWordNet
project Web page
http://www.ling.helsinki.fi/en/lt/research/finnwordnet/
HFST – Helsinki Finite-State Transducer Technology
--------------------------------------------------
For more information about HFST, please see the project Web page
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/hfst/
The FinnWordNet transducers use the HFST optimized lookup format:
https://kitwiki.csc.fi/twiki/bin/view/KitWiki/HfstOptimizedLookupFormat
The transducer files have the suffix .hfstol. Using them requires
either the HFST library and tools (version 3.2.0 or later) or the
standalone HFST optimized lookup program with which they can be run
(applied):
http://sourceforge.net/projects/hfst/files/optimized-lookup/
The transducers require version 1.3 or later of the standalone
optimized lookup or the Java implementation (hfst-ol.jar as of
2011-05-23); they do not work with the Python implementation of
2011-05-24.
FinnWordNet transducer packages
-------------------------------
The FinnWordNet transducers are divided into three packages, each with
a few slightly different transducers (YYYYMMDD denotes the release
date of the package):
fiwnsyn-fi-YYYYMMDD.zip - Finnish thesauri
fiwnsyn-en-YYYYMMDD.zip - English thesauri (based on the Princeton
WordNet)
fiwntransl-YYYYMMDD.zip - Finnish–English and English–Finnish
translation dictionaries.
This README file is common to all the packages.
The names of the thesaurus transducer files have the form
fiwnsyn-LG-TYPE.hfstol, where LG is the language code “fi” or “en” and
TYPE is one of the following:
infl - The transducer recognizes inflected forms of the input word
and generates synonyms with the same form. Multi-word synonyms
are neither recognized nor generated. A word is not considered its
own synonym.
infl-refl - The same as above but synonymy is reflexive: a word is
considered its own synonym. This makes it possible to generate
alternative forms of the input word, such as “indices” and
“indexes”.
noinfl - The transducer recognizes inflected forms of the input
word but generates synonyms in their base form. Multi-word
synonyms are recognized and generated for English and
only generated for Finnish. A word is not considered its own
synonym.
noinfl-refl - The same as above but synonymy is reflexive.
The names of the translation dictionary transducer files are
fiwntransl-fien.hfstol for the Finnish–English dictionary and
fiwntransl-enfi.hfstol for the English–Finnish one. They recognize
inflected forms of the input word but generate the base form of the
translation. The English–Finnish dictionary recognizes and generates
multi-word translations, whereas the Finnish–English one only
generates them.
Sources
-------
In addition to the FinnWordNet and Princeton WordNet data, the
transducers have been constructed using the Omorfi open morphology
tool for Finnish (http://gna.org/projects/omorfi) and the HFST English
morphology
(http://sourceforge.net/projects/hfst/files/morphological-transducers/hfst-english.tar.gz/download),
originally by Måns Hulden, based on Princeton WordNet data.
Deficiencies
------------
* Multi-word expressions are handled somewhat inconsistently.
* The Finnish thesauri, in particular the inflecting ones, often
generate many identical output words.
* The inflecting English thesaurus overgenerates some word forms, such
as an incorrect double plural genitive (“nets’s”) in addition to the
correct one (“nets’”).
* The non-inflecting English thesauri and the English–Finnish
dictionary recognize inflection in the last word of a multi-word
expression, even if it would be correct to inflect a preceding word.
For example, they recognize “arrive ated” but not the correct
“arrived at”.
* All the synonyms or translations of an ambiguous or polysemous word
form are listed together, without any sorting or grouping according
to the part of speech or word sense.
Licence
-------
Since FinnWordNet retains the structure and glosses of Princeton
WordNet, it is a derivative of PWN subject to the PWN licence:
http://wordnet.princeton.edu/wordnet/license/
The PWN licence allows free use, including commercial use, provided
that a copyright notice is given:
WordNet Release 3.0 This software and database is being provided
to you, the LICENSEE, by Princeton University under the following
license. By obtaining, using and/or copying this software and
database, you agree that you have read, understood, and will
comply with these terms and conditions.: Permission to use, copy,
modify and distribute this software and database and its
documentation for any purpose and without fee or royalty is hereby
granted, provided that you agree to comply with the following
copyright notice and statements, including the disclaimer, and
that the same appear on ALL copies of the software, database and
documentation, including modifications that you make for internal
use or for distribution. WordNet 3.0 Copyright 2006 by Princeton
University. All rights reserved. THIS SOFTWARE AND DATABASE IS
PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS
OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR
WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR
PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR
DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS,
COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
University or Princeton may not be used in advertising or
publicity pertaining to distribution of the software and/or
database. Title to copyright in this software, database and any
associated documentation shall at all times remain with Princeton
University and LICENSEE agrees to preserve same.
The translations of FinnWordNet are copyright of the University of
Helsinki and they are licenced under Creative Commons Attribution (CC
BY) 3.0, which is similar to the PWN licence:
http://creativecommons.org/licenses/by/3.0/
Please cite the following paper when referring to FinnWordNet:
Krister Lindén and Lauri Carlson. 2010. FinnWordNet – WordNet på
finska via översättning. LexicoNordica – Nordic Journal of
Lexicography, 17:119–140.
HFST is licenced under the GNU Lesser General Public License, version
3.0:
http://www.gnu.org/licenses/lgpl.html
Contact
-------
The FinnWordNet project is led by Dr Krister Lindén at the Department
of Modern Languages (Language Technology) of the University of
Helsinki. In technical questions, please contact Mr Jyrki Niemi. Email
addresses are of the form firstname.lastname@helsinki.fi (accents
removed).

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff