forked from GitHub/gf-core
DictEngFin: a translation dictionary based on Finnish wordnet, Kotus, and frequency list. Not yet quite functional.
This commit is contained in:
39731
lib/src/finnish/DictEngFin.gf
Normal file
39731
lib/src/finnish/DictEngFin.gf
Normal file
File diff suppressed because it is too large
Load Diff
@@ -188,6 +188,7 @@ oper
|
||||
mkV2 : V -> V2 ; -- direct transitive
|
||||
mkV2 : V -> Case -> V2 ; -- complement just case
|
||||
mkV2 : V -> Prep -> V2 ; -- complement pre/postposition
|
||||
mkV2 : VK -> V2 ; -- direct transitive of Kotus verb
|
||||
} ;
|
||||
|
||||
|
||||
@@ -607,6 +608,7 @@ oper
|
||||
mkV2 : V -> V2 = dirV2 ;
|
||||
mkV2 : V -> Case -> V2 = caseV2 ;
|
||||
mkV2 : V -> Prep -> V2 = mk2V2 ;
|
||||
mkV2 : VK -> V2 = \w -> dirV2 (vforms2V w.s ** {sc = NPCase Nom ; lock_V = <>}) ;
|
||||
} ;
|
||||
|
||||
mk2V2 : V -> Prep -> V2 ;
|
||||
|
||||
97
lib/src/finnish/ParseFin.gf
Normal file
97
lib/src/finnish/ParseFin.gf
Normal file
@@ -0,0 +1,97 @@
|
||||
--# -path=alltenses:.:../english

-- Finnish concrete syntax for the ParseEngAbs abstract syntax.
--
-- The module is assembled from the Finnish resource grammar modules listed
-- below plus the generated lexicon DictEngFin. A few functions are excluded
-- from VerbFin, and only selected names are taken from SymbolFin and IdiomFin.
--
-- NOTE(review): the linearization rules inside the {- ... -} block below are
-- disabled. They contain English strings ("myself", "that", "not"), so they
-- look like rules carried over from the English grammar that still need to be
-- rewritten for Finnish -- confirm before enabling any of them.

concrete ParseFin of ParseEngAbs =
  TenseX, ---- - [Pol, PNeg, PPos],
  CatFin,
  NounFin,
  AdjectiveFin,
  NumeralFin,
  SymbolFin [PN, Symb, MkSymb, SymbPN],
  ConjunctionFin,
  VerbFin - [SlashV2V, PassV2, UseCopula],
  AdverbFin,
  PhraseFin,
  SentenceFin,
  RelativeFin,
  IdiomFin [NP, VP, Tense, Cl, ProgrVP, ExistNP],
--  ExtraFin [NP, Quant, VPSlash, VP, Tense, GenNP, PassVPSlash],
  DictEngFin **
open MorphoFin, ResFin, ParadigmsFin, Prelude in {

flags literal=Symb ;

{-
lin
  myself_NP = regNP "myself" singular ;
  yourselfSg_NP = regNP "yourself" singular ;
  himself_NP = regNP "himself" singular ;
  herself_NP = regNP "herself" singular ;
  itself_NP = regNP "itself" singular ;
  ourself_NP = regNP "ourself" plural ;
  yourselfPl_NP = regNP "yourself" plural ;
  themself_NP = regNP "themself" plural ;
  themselves_NP = regNP "themselves" plural ;

  CompoundCN num noun cn = {
    s = \\n,c => num.s ! Nom ++ noun.s ! num.n ! Nom ++ cn.s ! n ! c ;
    g = cn.g
  } ;

  DashCN noun1 noun2 = {
    s = \\n,c => noun1.s ! Sg ! Nom ++ "-" ++ noun2.s ! n ! c ;
    g = noun2.g
  } ;

  GerundN v = {
    s = \\n,c => v.s ! VPresPart ;
    g = Neutr
  } ;

  GerundAP v = {
    s = \\agr => v.s ! VPresPart ;
    isPre = True
  } ;

  PastPartAP v = {
    s = \\agr => v.s ! VPPart ;
    isPre = True
  } ;

  OrdCompar a = {s = \\c => a.s ! AAdj Compar c } ;

  PositAdVAdj a = {s = a.s ! AAdv} ;

  UseQuantPN q pn = {s = \\c => q.s ! False ! Sg ++ pn.s ! npcase2case c ; a = agrgP3 Sg pn.g} ;

  SlashV2V v p vp = insertObjc (\\a => p.s ++ case p.p of {CPos => ""; _ => "not"} ++
                                       v.c3 ++
                                       infVP v.typ vp a)
                               (predVc v) ;

  ComplPredVP np vp = {
    s = \\t,a,b,o =>
      let
        verb = vp.s ! t ! a ! b ! o ! np.a ;
        compl = vp.s2 ! np.a
      in
      case o of {
        ODir => compl ++ "," ++ np.s ! npNom ++ verb.aux ++ verb.adv ++ vp.ad ++ verb.fin ++ verb.inf ;
        OQuest => verb.aux ++ compl ++ "," ++ np.s ! npNom ++ verb.adv ++ vp.ad ++ verb.fin ++ verb.inf
      }
  } ;

  that_RP = {
    s = \\_ => "that" ;
    a = RNoAg
  } ;
  no_RP = {
    s = \\_ => "" ;
    a = RNoAg
  } ;

  CompS s = {s = \\_ => "that" ++ s.s} ;
  CompVP vp = {s = \\a => infVP VVInf vp a} ;

lin
  PPos = {s = [] ; p = CPos} ;
  PNeg = {s = [] ; p = CNeg True} ; -- contracted: don't
-}

}
|
||||
4596
lib/src/finnish/wordnet/Adven_fi.txt
Normal file
4596
lib/src/finnish/wordnet/Adven_fi.txt
Normal file
File diff suppressed because it is too large
Load Diff
29016
lib/src/finnish/wordnet/Aen_fi.txt
Normal file
29016
lib/src/finnish/wordnet/Aen_fi.txt
Normal file
File diff suppressed because it is too large
Load Diff
95
lib/src/finnish/wordnet/FreqFin.hs
Normal file
95
lib/src/finnish/wordnet/FreqFin.hs
Normal file
@@ -0,0 +1,95 @@
|
||||
import Prelude hiding (Word)

import Data.Char
import Data.List
import Data.Map
|
||||
|
||||
-- a script for extracting an English-Finnish translation dictionary from
|
||||
-- (1) Eng-Fin wordnet links
|
||||
-- (2) Fin frequency dictionary
|
||||
-- (3) Fin KOTUS morpho wordlist
|
||||
-- usage: runghc FreqFin.hs >DictEngFin.gf (with appropriate files in place)
|
||||
-- AR 23/5/2012
|
||||
|
||||
-- | Build the dictionary and print it to standard output.
--
-- Reads the frequency list ("taajuus.txt") and the morphological lexicon
-- ("DictFin.gf"), then one translation file per category (V, V2, A, N, Adv).
-- Every translation entry is rendered with 'mkLin'; the rendered lines are
-- sorted as plain strings and written one per line.
main :: IO ()
main = do
  freqs  <- fmap getFreqMap   (readFile "taajuus.txt")
  morpho <- fmap getMorphoMap (readFile "DictFin.gf")
  -- Load one translation file for the given category.
  let translate cat file = fmap (getTransDict cat freqs morpho) (readFile file)
  transV   <- translate "V"   "Ven_fi.txt"
  transV2  <- translate "V2"  "V2en_fi.txt"
  transA   <- translate "A"   "Aen_fi.txt"
  transN   <- translate "N"   "Nen_fi.txt"
  transAdv <- translate "Adv" "Adven_fi.txt"
  let entries = concat [transV, transV2, transA, transN, transAdv]
  mapM_ putStrLn (sort (lmap mkLin entries))
|
||||
|
||||
-- | Build the frequency map from the text of the frequency list: each line is
-- split into whitespace-separated fields and decoded by 'getFreq'.
-- If a word occurs on several lines, 'fromList' keeps the last one.
getFreqMap :: String -> FreqMap
getFreqMap contents = fromList [getFreq (words ln) | ln <- lines contents]
|
||||
|
||||
-- List and map helpers under unambiguous names: Data.Map is imported
-- unqualified in this file, so plain 'map', 'lookup' and 'null' would clash.

-- | 'Prelude.map' on lists.
lmap :: (a -> b) -> [a] -> [b]
lmap f xs = Prelude.map f xs

-- | 'Data.Map.lookup'.
mlookup :: Ord k => k -> Map k a -> Maybe a
mlookup key table = Data.Map.lookup key table

-- | 'Prelude.null' on lists.
lnull :: [a] -> Bool
lnull xs = Prelude.null xs
|
||||
|
||||
|
||||
-- | Frequency list indexed by word: each word maps to its rank and category.
type FreqMap = Map Word (Rank,Cat)
|
||||
-- | Rank of a translation candidate; candidates are sorted ascending, so a lower rank is preferred.
type Rank = Int
|
||||
-- | Lexical category name, e.g. "V", "V2", "A", "N" or "Adv".
type Cat = String
|
||||
-- | A word form used as a dictionary key or translation.
-- NOTE: newer versions of base also export a type named Word from the Prelude;
-- this alias is only unambiguous if that one is hidden on import.
type Word = String
|
||||
-- | Text of a GF linearization expression (the right-hand side of a lin rule).
type Lin = String
|
||||
|
||||
-- | Decode one line of the frequency list, given as its whitespace-separated
-- fields. The first field is read as the integer rank, the fourth is the word
-- and the fifth its category; the second and third fields are ignored.
--
-- Malformed input raises an error that names the offending line, instead of
-- an anonymous pattern-match or "no parse" failure. As before, the failure
-- only surfaces when the corresponding part of the result is demanded.
getFreq :: [String] -> (Word,(Rank,Cat))
getFreq ws = case ws of
  n:_:_:w:c:_ -> (w, (rankOf n, c))
  _ -> error ("getFreq: expected at least 5 fields in frequency line: " ++ show (unwords ws))
  where
    -- Accept the field only if it is an integer with nothing left over.
    rankOf n = case reads n of
      [(r, "")] -> r
      _ -> error ("getFreq: non-numeric rank " ++ show n ++ " in frequency line: " ++ show (unwords ws))
|
||||
|
||||
|
||||
-- | Morphological lexicon indexed by word: each word maps to its category and linearization.
type MorphoMap = Map Word (Cat,Lin)
|
||||
|
||||
-- | Build the morphological map from the text of the lexicon file: every line
-- is split into words and passed to 'getMorpho'; lines that are not lin rules
-- contribute nothing.
getMorphoMap :: String -> MorphoMap
getMorphoMap src = fromList (concatMap (getMorpho . words) (lines src))
|
||||
|
||||
-- | Decode one lexicon line of the shape @lin <word>_<Cat>K = <expr> ;@,
-- given as its whitespace-separated tokens.
--
-- Returns a single @(word, (category, linearization))@ entry, or nothing if
-- the line is not a lin rule. The function name is split at its first
-- underscore; the category is what follows, minus its last character
-- (so "NK" becomes "N"). The linearization is the right-hand side without
-- its final token (the terminating ";").
--
-- Lines without a non-empty @_<Cat>@ suffix or without any right-hand side
-- are skipped; previously they produced entries that crashed when inspected.
getMorpho :: [String] -> [(String,(String,String))]
getMorpho ws = case ws of
  "lin" : name : _ : rhs@(_:_)
    | (wd, _ : c@(_:_)) <- break (== '_') name -> [(wd, (init c, unwords (init rhs)))]
  _ -> []
|
||||
|
||||
-- | Translation dictionary: source word, its category, and the ranked target-language candidates.
type TransDict = [(Word,(Cat,[(Word,(Rank,Lin))]))]
|
||||
|
||||
-- | Parse one translation file into dictionary entries of the given category.
--
-- The text is cut into stanzas (runs of non-empty lines, see 'stanzas').
-- Within a stanza every line starts with the source word and continues with
-- one translation (one or more words). The source word is taken from the
-- first line; each line's translation is ranked by 'getRank' and the list is
-- passed through 'sortTrans'.
--
-- Whitespace-only lines are dropped, and a stanza with nothing left is
-- skipped; previously a stanza starting with such a line crashed.
getTransDict :: Cat -> FreqMap -> MorphoMap -> String -> TransDict
getTransDict cat freqs morpho = concatMap getOne . lmap wordLines . stanzas . lines
  where
    -- Split each stanza line into words, discarding lines without any word.
    wordLines stanza = [ws | ws@(_:_) <- lmap words stanza]

    getOne ls = case ls of
      (w:_):_ -> [(w, (cat, sortTrans cat [getRank vs | _:vs <- ls]))]
      _       -> []

    -- Rank one translation and choose its linearization:
    --   * in both maps with a compatible category: frequency rank, lexicon linearization
    --   * in the frequency list only: frequency rank, quoted word
    --   * in the lexicon only: morphoRank, lexicon linearization
    --   * unknown but starting with a letter: guessRank, quoted word
    --   * anything else: noRank, unquoted word
    --   * not exactly one word: compRank, the words joined unquoted
    getRank [v] = case (mlookup v freqs, mlookup v morpho) of
      (Just (i,c), Just (k,l)) | compatCat cat c && compatCat cat k -> (v, (i, lin l))
      (Just (i,c), _)          | compatCat cat c -> (v, (i, lin (quote v)))
      (_, Just (c,l))          | compatCat cat c -> (v, (morphoRank, lin l))
      -- Qualified: Data.Map is imported unqualified and may also export 'take'.
      _ | all isLetter (Prelude.take 1 v)        -> (v, (guessRank, lin (quote v)))
      _                                          -> (v, (noRank, lin v))
    getRank vs = (unwords vs, (compRank, lin (unwords vs)))

    -- Apply the category's mk<Cat> constructor to an argument.
    lin l = "mk" ++ cat ++ " " ++ l

    quote s = "\"" ++ s ++ "\""
|
||||
|
||||
-- | Order translation candidates by ascending rank and keep only the best
-- one (or none if the list is empty). The sort is stable, so among equal
-- ranks the candidate that came first in the input wins. The category
-- argument is currently unused.
sortTrans :: Cat -> [(Word,(Rank,Lin))] -> [(Word,(Rank,Lin))]
sortTrans cat = chooseBest . sortBy byRank
  where
    byRank (_,(r,_)) (_,(s,_)) = compare r s
    -- Qualified: Data.Map is imported unqualified and may also export 'take'.
    chooseBest = Prelude.take 1 ----
|
||||
|
||||
-- | Does a category found in the frequency list or lexicon (second argument)
-- fit the category being generated (first argument)? A "V2" entry is served
-- by a plain "V"; every other category must match exactly.
compatCat :: String -> String -> Bool
compatCat cat c
  | cat == "V2" = c == "V"
  | otherwise   = c == cat
|
||||
|
||||
-- Synthetic ranks for translations that have no usable frequency rank.
-- Lower ranks are preferred by 'sortTrans'; 'mkLin' only emits an active
-- rule for ranks below 'noRank'.

-- | Word found in the morphological lexicon but not in the frequency list.
morphoRank :: Int
morphoRank = 10000

-- | Unknown word that starts with a letter.
guessRank :: Int
guessRank = 20000

-- | Unknown word that does not start with a letter.
noRank :: Int
noRank = 30000

-- | Translation that is not a single word.
compRank :: Int
compRank = 40000
|
||||
|
||||
-- | Render one dictionary entry as a line of GF source.
--
-- The function name is the source word with '-' replaced by '_', followed by
-- "_" and the category. If the best candidate has a rank below 'noRank', an
-- active rule @lin <fun> = <linearization> ;@ is produced. Otherwise the
-- rule is emitted commented out (@-- lin ...@) with the candidate word in
-- quotes, so that it can be reviewed by hand.
--
-- An entry without any candidate also yields a commented-out rule (with an
-- empty string) rather than a pattern-match failure.
mkLin :: (Word,(Cat,[(Word,(Rank,Lin))])) -> String
mkLin (word,(cat,ws)) = unwords [keyw, fun, "=", lin, ";"]
  where
    fun = lmap clean word ++ "_" ++ cat
    (keyw, lin) = case ws of
      (_,(r,l)):_ | r < noRank -> ("lin", l)
      (w,_):_                  -> ("-- lin", "\"" ++ w ++ "\"") ---- look inside non-freq words
      []                       -> ("-- lin", "\"\"")
    clean c = case c of
      '-' -> '_'
      _   -> c
|
||||
|
||||
-- | Group lines into stanzas: maximal runs of consecutive non-empty lines.
-- Empty lines only act as separators and never appear in the result.
stanzas :: [String] -> [[String]]
stanzas ls
  | Prelude.null rest = []
  | otherwise         = block : stanzas remaining
  where
    -- Skip separator lines, then take lines up to the next separator.
    rest               = dropWhile Prelude.null ls
    (block, remaining) = break Prelude.null rest
|
||||
190
lib/src/finnish/wordnet/LUEMINUT_fst
Normal file
190
lib/src/finnish/wordnet/LUEMINUT_fst
Normal file
@@ -0,0 +1,190 @@
|
||||
|
||||
FinnWordNetin sanastoon perustuvat HFST-transduktorit
|
||||
=====================================================
|
||||
|
||||
|
||||
Tämä paketti sisältää FinnWordNetin sanastodatan pohjalta luotuja
|
||||
HFST-transduktoreita, joita voi käyttää (taivuttavina) suomen tai
|
||||
englannin synonyymisanastoina tai käännössanakirjoina.
|
||||
|
||||
|
||||
FinnWordNet
|
||||
-----------
|
||||
|
||||
FinnWordNet on suomen wordnet. Se on luotu käännättämällä
|
||||
ammattikääntäjillä alkuperäisen englanninkielisen Princeton WordNetin
|
||||
(PWN) version 3.0 sanat (sananmerkitykset) suomeksi ja yhdistämällä
|
||||
käännökset PWN:n rakenteeseen. FinnWordNet on osa
|
||||
FIN-CLARIN-infrastruktuurihanketta:
|
||||
|
||||
http://www.ling.helsinki.fi/finclarin/
|
||||
|
||||
Lisätietoja FinnWordNetistä saa FinnWordNet-projektin WWW-sivulta:
|
||||
|
||||
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/finnwordnet/
|
||||
|
||||
|
||||
HFST – Helsinki Finite-State Transducer Technology
|
||||
--------------------------------------------------
|
||||
|
||||
Lisätietoa HFST:stä (englanniksi) saa projektin WWW-sivulta:
|
||||
|
||||
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/hfst/
|
||||
|
||||
FinnWordNetin transduktorit ovat HFST:n optimized lookup -muodossa:
|
||||
|
||||
https://kitwiki.csc.fi/twiki/bin/view/KitWiki/HfstOptimizedLookupFormat
|
||||
|
||||
Transduktoritiedostoissa on pääte .hfstol. Niiden käyttäminen
|
||||
edellyttää joko HFST-kirjastoa ja -työkaluja (versiota 3.2.0 tai
|
||||
uudempaa) tai erillistä HFST optimized lookup -ohjelmaa, jolla niitä
|
||||
voi ajaa:
|
||||
|
||||
http://sourceforge.net/projects/hfst/files/optimized-lookup/
|
||||
|
||||
Transduktorit tarvitsevat optimized lookup -ohjelman version 1.3 (tai
|
||||
uudemman) tai Java-toteutuksen (hfst-ol.jar, 2011-05-23 tai uudempi);
|
||||
ne eivät toimi Python-toteutuksella (2011-05-24).
|
||||
|
||||
|
||||
FinnWordNetin transduktoripaketit
|
||||
---------------------------------
|
||||
|
||||
FinnWordNetin transduktorit on jaettu kolmeen pakettiin, joista
|
||||
jokaisessa on hiukan erilaiset transduktorit (YYYYMMDD viittaa paketin
|
||||
julkaisupäivään):
|
||||
|
||||
fiwnsyn-fi-YYYYMMDD.zip – Suomen synonyymisanastot
|
||||
|
||||
fiwnsyn-en-YYYYMMDD.zip - Englannin synonyymisanastot (perustuvat
|
||||
Princeton WordNetiin)
|
||||
|
||||
fiwntransl-YYYYMMDD.zip - Suomi–englanti ja englanti–suomi
|
||||
käännössanakirjat
|
||||
|
||||
Tämä LUEMINUT-tiedosto on yhteinen kaikille paketeille.
|
||||
|
||||
Synonyymisanastotransduktorien nimet ovat muotoa
|
||||
fiwnsyn-KL-TYYPPI.hfstol, missä KL on kielikoodi ”fi” tai ”en” ja
|
||||
TYYPPI on yksi seuraavista:
|
||||
|
||||
infl – Transduktori tunnistaa syötesanan taivutusmuodon ja tuottaa
|
||||
synonyymit samassa taivutusmuodossa. Transduktori ei tunnista
|
||||
eikä tuota monisanaisia synonyymeja. Sanaa ei lasketa itsensä
|
||||
synonyymiksi.
|
||||
|
||||
infl-refl – Sama kuin edellä, mutta synonymia on refleksiivistä:
|
||||
sana lasketaan itsensä synonyymiksi. Tämä mahdollistaa
|
||||
syötesanan mahdollisten vaihtoehtoisten taivutusmuotojen
|
||||
tuottamisen, kuten ”omenoiden”, ”omenoitten”, ”omenien”,
|
||||
”omenojen”, ”omenain”.
|
||||
|
||||
noinfl - Transduktori tunnistaa syötesanan taivutetuissa
|
||||
muodoissa, mutta tuottaa synonyymit perusmuodoissaan.
|
||||
Englannin transduktorit tunnistavat ja tuottavat myös
|
||||
monisanaiset ilmaukset ja suomen transduktorit tuottavat.
|
||||
Sanaa ei lasketa itsensä synonyymiksi.
|
||||
|
||||
noinfl-refl – Sama kuin edellä, mutta synonymia on refleksiivistä.
|
||||
|
||||
Käännössanakirjatransduktoritiedostojen nimet ovat
|
||||
fiwntransl-fien.hfstol (suomi–englanti) ja fiwntransl-enfi.hfstol
|
||||
(englanti–suomi). Ne tunnistavat syötesanan taivutettuja muotoja,
|
||||
mutta tuottavat käännökset perusmuodossa. Englanti–suomi-sanakirja
|
||||
sekä tunnistaa että tuottaa monisanaisia ilmauksia, kun taas
|
||||
suomi–englanti-sanakirja vain tuottaa niitä.
|
||||
|
||||
|
||||
Lähteitä
|
||||
--------
|
||||
|
||||
FinnWordNetin ja Princeton WordNetin datan lisäksi transduktorien
|
||||
tekemisessä on käytetty Omorfia, suomen avointa morfologista työkalua
|
||||
(http://gna.org/projects/omorfi), ja HFST:n englannin morfologiaa
|
||||
(http://sourceforge.net/projects/hfst/files/morphological-transducers/hfst-english.tar.gz/download),
|
||||
joka on alun perin Måns Huldenin Princeton WordNetin datan pohjalta
|
||||
tekemä.
|
||||
|
||||
|
||||
Puutteita
|
||||
---------
|
||||
|
||||
* Monisanaisten ilmausten käsittely on osittain epäyhtenäistä.
|
||||
|
||||
* Suomen synonyymisanastot, erityisesti taivuttavat sanastot,
|
||||
tuottavat usein monia identtisiä tulostesanoja.
|
||||
|
||||
* Englannin taivuttava synonyymisanasto yligeneroi joitain
|
||||
sananmuotoja, kuten virheellisen kaksinkertaisen monikon genetiivin
|
||||
(”nets’s”) oikean (”nets’”) lisäksi.
|
||||
|
||||
* Ei-taivuttava englannin synonyymisanasto ja englanti–suomi-sanakirja
|
||||
tunnistavat taivutuksen monisanaisen ilmauksen viimeisessä sanassa,
|
||||
vaikka olisi oikein taivuttaa jotain aiempaa sanaa. Ne tunnistavat
|
||||
esimerkiksi ”arrive ated” oikean muodon ”arrived at” sijaan.
|
||||
|
||||
* Monitulkintaisen tai monimerkityksisen sananmuodon kaikki synonyymit
|
||||
tai käännökset luetellaan yhdessä, ilman järjestystä tai ryhmittelyä
|
||||
sanaluokan tai sananmerkityksen mukaan.
|
||||
|
||||
|
||||
Lisenssi
|
||||
--------
|
||||
|
||||
Koska FinnWordNet käyttää Princeton WordNetin rakennetta ja
|
||||
merkitysten selitteitä, se on PWN:n johdannainen ja siten PWN:n
|
||||
lisenssin alainen:
|
||||
|
||||
http://wordnet.princeton.edu/wordnet/license/
|
||||
|
||||
PWN:n lisenssi sallii vapaan käytön, myös kaupallisesti, kunhan sen
|
||||
käyttämisestä ja tekijänoikeuksista kerrotaan:
|
||||
|
||||
WordNet Release 3.0 This software and database is being provided
|
||||
to you, the LICENSEE, by Princeton University under the following
|
||||
license. By obtaining, using and/or copying this software and
|
||||
database, you agree that you have read, understood, and will
|
||||
comply with these terms and conditions.: Permission to use, copy,
|
||||
modify and distribute this software and database and its
|
||||
documentation for any purpose and without fee or royalty is hereby
|
||||
granted, provided that you agree to comply with the following
|
||||
copyright notice and statements, including the disclaimer, and
|
||||
that the same appear on ALL copies of the software, database and
|
||||
documentation, including modifications that you make for internal
|
||||
use or for distribution. WordNet 3.0 Copyright 2006 by Princeton
|
||||
University. All rights reserved. THIS SOFTWARE AND DATABASE IS
|
||||
PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS
|
||||
OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
|
||||
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR
|
||||
WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR
|
||||
PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR
|
||||
DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS,
|
||||
COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
|
||||
University or Princeton may not be used in advertising or
|
||||
publicity pertaining to distribution of the software and/or
|
||||
database. Title to copyright in this software, database and any
|
||||
associated documentation shall at all times remain with Princeton
|
||||
University and LICENSEE agrees to preserve same.
|
||||
|
||||
FinnWordNetin sisältämien sanojen suomenkielisten käännösten
|
||||
tekijänoikeudet ovat Helsingin yliopistolla. Ne lisensoidaan Creative
|
||||
Commons Nimeä (CC BY) 3.0 -lisenssillä, joka on samantapainen kuin
|
||||
PWN:n lisenssi:
|
||||
|
||||
http://creativecommons.org/licenses/by/3.0/deed.fi
|
||||
|
||||
Kun viittaat FinnWordNetiin, viittaa seuraavaan artikkeliin:
|
||||
|
||||
Krister Lindén and Lauri Carlson. 2010. FinnWordNet – WordNet på
|
||||
finska via översättning. LexicoNordica – Nordic Journal of
|
||||
Lexicography, 17:119–140.
|
||||
|
||||
|
||||
Yhteystiedot
|
||||
------------
|
||||
|
||||
FinnWordNet-projektia johtaa tutkimusjohtaja, FT Krister Lindén
|
||||
Helsingin yliopiston nykykielten laitoksessa (kieliteknologian
|
||||
oppiaineessa). Teknisissä kysymyksissä yhteyshenkilönä on
|
||||
projektitutkija Jyrki Niemi. Sähköpostiosoitteet ovat muotoa
|
||||
etunimi.sukunimi@helsinki.fi (aksentit poistettuina).
|
||||
77526
lib/src/finnish/wordnet/Nen_fi.txt
Normal file
77526
lib/src/finnish/wordnet/Nen_fi.txt
Normal file
File diff suppressed because it is too large
Load Diff
195
lib/src/finnish/wordnet/README_fst
Normal file
195
lib/src/finnish/wordnet/README_fst
Normal file
@@ -0,0 +1,195 @@
|
||||
|
||||
HFST transducers based on FinnWordNet dictionary data
|
||||
=====================================================
|
||||
|
||||
|
||||
This package contains various HFST transducers based on FinnWordNet
|
||||
lexical data. The transducers can be used as (inflecting) Finnish or
|
||||
English thesauri or translation dictionaries.
|
||||
|
||||
|
||||
FinnWordNet
|
||||
-----------
|
||||
|
||||
FinnWordNet is a wordnet for Finnish. It was created by having
|
||||
professional translators translate the word senses of the Princeton
|
||||
WordNet (PWN) 3.0 into Finnish and by combining the translations with
|
||||
the PWN structure. FinnWordNet is a part of the FIN-CLARIN project:
|
||||
|
||||
http://www.ling.helsinki.fi/finclarin/
|
||||
|
||||
For more information about FinnWordNet, please visit the FinnWordNet
|
||||
project Web page:
|
||||
|
||||
http://www.ling.helsinki.fi/en/lt/research/finnwordnet/
|
||||
|
||||
|
||||
HFST – Helsinki Finite-State Transducer Technology
|
||||
--------------------------------------------------
|
||||
|
||||
For more information about HFST, please see the project Web page:
|
||||
|
||||
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/hfst/
|
||||
|
||||
The FinnWordNet transducers use the HFST optimized lookup format:
|
||||
|
||||
https://kitwiki.csc.fi/twiki/bin/view/KitWiki/HfstOptimizedLookupFormat
|
||||
|
||||
The transducer files have the suffix .hfstol. Using them requires
|
||||
either the HFST library and tools (version 3.2.0 or later) or the
|
||||
standalone HFST optimized lookup program with which they can be run
|
||||
(applied):
|
||||
|
||||
http://sourceforge.net/projects/hfst/files/optimized-lookup/
|
||||
|
||||
The transducers require version 1.3 or later of the standalone
|
||||
optimized lookup or the Java implementation (hfst-ol.jar as of
|
||||
2011-05-23); they do not work with the Python implementation of
|
||||
2011-05-24.
|
||||
|
||||
|
||||
FinnWordNet transducer packages
|
||||
-------------------------------
|
||||
|
||||
The FinnWordNet transducers are divided into three packages, each with
|
||||
a few slightly different transducers (YYYYMMDD denotes the release
|
||||
date of the package):
|
||||
|
||||
fiwnsyn-fi-YYYYMMDD.zip – Finnish thesauri
|
||||
|
||||
fiwnsyn-en-YYYYMMDD.zip - English thesauri (based on the Princeton
|
||||
WordNet)
|
||||
|
||||
fiwntransl-YYYYMMDD.zip - Finnish–English and English–Finnish
|
||||
translation dictionaries.
|
||||
|
||||
This README file is common to all the packages.
|
||||
|
||||
The names of the thesaurus transducer files have the form
|
||||
fiwnsyn-LG-TYPE.hfstol, where LG is the language code ‘fi’ or ‘en’ and
|
||||
TYPE is one of the following:
|
||||
|
||||
infl – The transducer recognizes inflected forms of the input word
|
||||
and generates synonyms with the same form. Multi-word synonyms
|
||||
are neither recognized nor generated. A word is not considered its
|
||||
own synonym.
|
||||
|
||||
infl-refl – The same as above but synonymy is reflexive: a word is
|
||||
considered its own synonym. This makes it possible to generate
|
||||
alternative forms of the input word, such as ‘indices’ and
|
||||
‘indexes’.
|
||||
|
||||
noinfl - The transducer recognizes inflected forms of the input
|
||||
word but generates synonyms in their base form. Multi-word
|
||||
synonyms are recognized and generated for English and
|
||||
generated for Finnish. A word is not considered its own
|
||||
synonym.
|
||||
|
||||
noinfl-refl – The same as above but synonymy is reflexive.
|
||||
|
||||
The names of the translation dictionary transducer files are
|
||||
fiwntransl-fien.hfstol for the Finnish–English dictionary and
|
||||
fiwntransl-enfi.hfstol for the English–Finnish one. They recognize
|
||||
inflected forms of the input word but generate the base form of the
|
||||
translation. The English–Finnish dictionary recognizes and generates
|
||||
multi-word translations, whereas the Finnish–English one only
|
||||
generates them.
|
||||
|
||||
|
||||
Sources
|
||||
-------
|
||||
|
||||
In addition to the FinnWordNet and Princeton WordNet data, the
|
||||
transducers have been constructed using the Omorfi open morphology
|
||||
tool for Finnish (http://gna.org/projects/omorfi) and the HFST English
|
||||
morphology
|
||||
(http://sourceforge.net/projects/hfst/files/morphological-transducers/hfst-english.tar.gz/download),
|
||||
originally by Måns Hulden, based on Princeton WordNet data.
|
||||
|
||||
|
||||
Deficiencies
|
||||
------------
|
||||
|
||||
* Multi-word expressions are handled somewhat inconsistently.
|
||||
|
||||
* The Finnish thesauri, in particular the inflecting ones, often
|
||||
generate many identical output words.
|
||||
|
||||
* The inflecting English thesaurus overgenerates some word forms, such
|
||||
as an incorrect double plural genitive (‘nets’s’) in addition to the
|
||||
correct one (‘nets’’).
|
||||
|
||||
* The non-inflecting English thesauri and the English–Finnish
|
||||
dictionary recognize inflection in the last word of a multi-word
|
||||
expression, even if it would be correct to inflect a preceding word.
|
||||
For example, they recognize ‘arrive ated’ but not the correct
|
||||
‘arrived at’.
|
||||
|
||||
* All the synonyms or translations of an ambiguous or polysemous word
|
||||
form are listed together, without any sorting or grouping according
|
||||
to the part of speech or word sense.
|
||||
|
||||
|
||||
Licence
|
||||
-------
|
||||
|
||||
Since FinnWordNet retains the structure and glosses of Princeton
|
||||
WordNet, it is a derivative of PWN subject to the PWN licence:
|
||||
|
||||
http://wordnet.princeton.edu/wordnet/license/
|
||||
|
||||
The PWN licence allows free use, including commercial use, provided
|
||||
that a copyright notice is given:
|
||||
|
||||
WordNet Release 3.0 This software and database is being provided
|
||||
to you, the LICENSEE, by Princeton University under the following
|
||||
license. By obtaining, using and/or copying this software and
|
||||
database, you agree that you have read, understood, and will
|
||||
comply with these terms and conditions.: Permission to use, copy,
|
||||
modify and distribute this software and database and its
|
||||
documentation for any purpose and without fee or royalty is hereby
|
||||
granted, provided that you agree to comply with the following
|
||||
copyright notice and statements, including the disclaimer, and
|
||||
that the same appear on ALL copies of the software, database and
|
||||
documentation, including modifications that you make for internal
|
||||
use or for distribution. WordNet 3.0 Copyright 2006 by Princeton
|
||||
University. All rights reserved. THIS SOFTWARE AND DATABASE IS
|
||||
PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS
|
||||
OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
|
||||
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR
|
||||
WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR
|
||||
PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR
|
||||
DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS,
|
||||
COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
|
||||
University or Princeton may not be used in advertising or
|
||||
publicity pertaining to distribution of the software and/or
|
||||
database. Title to copyright in this software, database and any
|
||||
associated documentation shall at all times remain with Princeton
|
||||
University and LICENSEE agrees to preserve same.
|
||||
|
||||
The translations of FinnWordNet are copyright of the University of
|
||||
Helsinki and they are licenced under Creative Commons Attribution (CC
|
||||
BY) 3.0, which is similar to the PWN licence:
|
||||
|
||||
http://creativecommons.org/licenses/by/3.0/
|
||||
|
||||
Please cite the following paper when referring to FinnWordNet:
|
||||
|
||||
Krister Lindén and Lauri Carlson. 2010. FinnWordNet – WordNet på
|
||||
finska via översättning. LexicoNordica – Nordic Journal of
|
||||
Lexicography, 17:119–140.
|
||||
|
||||
HFST is licenced under the GNU Lesser General Public License, version
|
||||
3.0:
|
||||
|
||||
http://www.gnu.org/licenses/lgpl.html
|
||||
|
||||
|
||||
Contact
|
||||
-------
|
||||
|
||||
The FinnWordNet project is led by Dr Krister Lindén at the Department
|
||||
of Modern Languages (Language Technology) of the University of
|
||||
Helsinki. In technical questions, please contact Mr Jyrki Niemi. Email
|
||||
addresses are of the form firstname.lastname@helsinki.fi (accents
|
||||
removed).
|
||||
29576
lib/src/finnish/wordnet/V2en_fi.txt
Normal file
29576
lib/src/finnish/wordnet/V2en_fi.txt
Normal file
File diff suppressed because it is too large
Load Diff
2929
lib/src/finnish/wordnet/VSen_fi.txt
Normal file
2929
lib/src/finnish/wordnet/VSen_fi.txt
Normal file
File diff suppressed because it is too large
Load Diff
19849
lib/src/finnish/wordnet/Ven_fi.txt
Normal file
19849
lib/src/finnish/wordnet/Ven_fi.txt
Normal file
File diff suppressed because it is too large
Load Diff
9996
lib/src/finnish/wordnet/taajuus.txt
Normal file
9996
lib/src/finnish/wordnet/taajuus.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user