forked from GitHub/gf-core
DictEngFin: a translation dictionary based on Finnish wordnet, Kotus, and frequency list. Not yet quite functional.
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -188,6 +188,7 @@ oper
|
||||
mkV2 : V -> V2 ; -- direct transitive
|
||||
mkV2 : V -> Case -> V2 ; -- complement just case
|
||||
mkV2 : V -> Prep -> V2 ; -- complement pre/postposition
|
||||
mkV2 : VK -> V2 ; -- direct transitive of Kotus verb
|
||||
} ;
|
||||
|
||||
|
||||
@@ -607,6 +608,7 @@ oper
|
||||
mkV2 : V -> V2 = dirV2 ;
|
||||
mkV2 : V -> Case -> V2 = caseV2 ;
|
||||
mkV2 : V -> Prep -> V2 = mk2V2 ;
|
||||
mkV2 : VK -> V2 = \w -> dirV2 (vforms2V w.s ** {sc = NPCase Nom ; lock_V = <>}) ;
|
||||
} ;
|
||||
|
||||
mk2V2 : V -> Prep -> V2 ;
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
--# -path=alltenses:.:../english
|
||||
-- Concrete syntax ParseFin for the abstract syntax ParseEngAbs.
-- The module is assembled by inheritance: it extends the *Fin modules
-- listed below plus the DictEngFin lexicon. "M [f, g]" inherits only the
-- listed names from M, "M - [f, g]" inherits everything except them.
-- Besides the inherited material the body only sets the literal flag;
-- every lin rule is inside the block comment and therefore inactive.
-- NOTE(review): the disabled rules use English string literals ("myself",
-- "that", "not"), so they look like they were carried over from an English
-- concrete syntax and still need Finnish replacements -- confirm.
concrete ParseFin of ParseEngAbs =
|
||||
TenseX, ---- - [Pol, PNeg, PPos],
|
||||
CatFin,
|
||||
NounFin,
|
||||
AdjectiveFin,
|
||||
NumeralFin,
|
||||
SymbolFin [PN, Symb, MkSymb, SymbPN],
|
||||
ConjunctionFin,
|
||||
VerbFin - [SlashV2V, PassV2, UseCopula],
|
||||
AdverbFin,
|
||||
PhraseFin,
|
||||
SentenceFin,
|
||||
RelativeFin,
|
||||
IdiomFin [NP, VP, Tense, Cl, ProgrVP, ExistNP],
|
||||
-- ExtraFin [NP, Quant, VPSlash, VP, Tense, GenNP, PassVPSlash],
|
||||
DictEngFin **
|
||||
open MorphoFin, ResFin, ParadigmsFin, Prelude in {
|
||||
|
||||
flags literal=Symb ;
|
||||
-- Everything from here to the closing "-}" is commented out.
{-
|
||||
lin
|
||||
myself_NP = regNP "myself" singular ;
|
||||
yourselfSg_NP = regNP "yourself" singular ;
|
||||
himself_NP = regNP "himself" singular ;
|
||||
herself_NP = regNP "herself" singular ;
|
||||
itself_NP = regNP "itself" singular ;
|
||||
ourself_NP = regNP "ourself" plural ;
|
||||
yourselfPl_NP = regNP "yourself" plural ;
|
||||
themself_NP = regNP "themself" plural ;
|
||||
themselves_NP = regNP "themselves" plural ;
|
||||
|
||||
CompoundCN num noun cn = {
|
||||
s = \\n,c => num.s ! Nom ++ noun.s ! num.n ! Nom ++ cn.s ! n ! c ;
|
||||
g = cn.g
|
||||
} ;
|
||||
|
||||
DashCN noun1 noun2 = {
|
||||
s = \\n,c => noun1.s ! Sg ! Nom ++ "-" ++ noun2.s ! n ! c ;
|
||||
g = noun2.g
|
||||
} ;
|
||||
|
||||
GerundN v = {
|
||||
s = \\n,c => v.s ! VPresPart ;
|
||||
g = Neutr
|
||||
} ;
|
||||
|
||||
GerundAP v = {
|
||||
s = \\agr => v.s ! VPresPart ;
|
||||
isPre = True
|
||||
} ;
|
||||
|
||||
PastPartAP v = {
|
||||
s = \\agr => v.s ! VPPart ;
|
||||
isPre = True
|
||||
} ;
|
||||
|
||||
OrdCompar a = {s = \\c => a.s ! AAdj Compar c } ;
|
||||
|
||||
PositAdVAdj a = {s = a.s ! AAdv} ;
|
||||
|
||||
UseQuantPN q pn = {s = \\c => q.s ! False ! Sg ++ pn.s ! npcase2case c ; a = agrgP3 Sg pn.g} ;
|
||||
|
||||
SlashV2V v p vp = insertObjc (\\a => p.s ++ case p.p of {CPos => ""; _ => "not"} ++
|
||||
v.c3 ++
|
||||
infVP v.typ vp a)
|
||||
(predVc v) ;
|
||||
|
||||
ComplPredVP np vp = {
|
||||
s = \\t,a,b,o =>
|
||||
let
|
||||
verb = vp.s ! t ! a ! b ! o ! np.a ;
|
||||
compl = vp.s2 ! np.a
|
||||
in
|
||||
case o of {
|
||||
ODir => compl ++ "," ++ np.s ! npNom ++ verb.aux ++ verb.adv ++ vp.ad ++ verb.fin ++ verb.inf ;
|
||||
OQuest => verb.aux ++ compl ++ "," ++ np.s ! npNom ++ verb.adv ++ vp.ad ++ verb.fin ++ verb.inf
|
||||
}
|
||||
} ;
|
||||
|
||||
that_RP = {
|
||||
s = \\_ => "that" ;
|
||||
a = RNoAg
|
||||
} ;
|
||||
no_RP = {
|
||||
s = \\_ => "" ;
|
||||
a = RNoAg
|
||||
} ;
|
||||
|
||||
CompS s = {s = \\_ => "that" ++ s.s} ;
|
||||
CompVP vp = {s = \\a => infVP VVInf vp a} ;
|
||||
|
||||
lin
|
||||
PPos = {s = [] ; p = CPos} ;
|
||||
PNeg = {s = [] ; p = CNeg True} ; -- contracted: don't
|
||||
-}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,95 @@
|
||||
-- This file defines its own `type Word = String`; hide the Prelude's Word
-- (exported since base 4.8) so that the signatures below are unambiguous.
import Prelude hiding (Word)

import Data.Char
import Data.List
import Data.Map
|
||||
|
||||
-- a script for extracting an English-Finnish translation dictionary from
|
||||
-- (1) Eng-Fin wordnet links
|
||||
-- (2) Fin frequency dictionary
|
||||
-- (3) Fin KOTUS morpho wordlist
|
||||
-- usage: runghc FreqFin.hs >DictEngFin.gf (with appropriate files in place)
|
||||
-- AR 23/5/2012
|
||||
|
||||
-- Entry point: read the frequency list and the morphological lexicon,
-- build one translation dictionary per category from the wordnet link
-- files, and print the resulting GF linearization rules in sorted order.
main :: IO ()
main = do
  freqs  <- fmap getFreqMap   (readFile "taajuus.txt")
  morpho <- fmap getMorphoMap (readFile "DictFin.gf")
  -- every link file is processed the same way; only category and path differ
  let load cat path = fmap (getTransDict cat freqs morpho) (readFile path)
  transV   <- load "V"   "Ven_fi.txt"
  transV2  <- load "V2"  "V2en_fi.txt"
  transA   <- load "A"   "Aen_fi.txt"
  transN   <- load "N"   "Nen_fi.txt"
  transAdv <- load "Adv" "Adven_fi.txt"
  let entries = concat [transV, transV2, transA, transN, transAdv]
      cnc     = sort (lmap mkLin entries)
  mapM_ putStrLn cnc
|
||||
|
||||
-- Build the frequency map from the contents of the frequency list, one
-- entry per line. Lines that parseFreq cannot interpret (blank lines,
-- header lines, lines with fewer than five fields or a non-numeric first
-- field) are skipped instead of aborting the whole run, in the same way
-- getMorphoMap skips lines it does not understand. When a word occurs on
-- several lines, Data.Map.fromList keeps the last of them.
getFreqMap :: String -> FreqMap
getFreqMap contents =
  fromList [entry | l <- lines contents, entry <- parseFreq (words l)]

-- List and Map functions under unambiguous names: Data.Map is imported
-- unqualified, so bare map/lookup/null would clash with the Prelude.
-- The explicit signatures keep mlookup and lnull from being pinned down
-- (or rejected) by the monomorphism restriction.
lmap :: (a -> b) -> [a] -> [b]
lmap = Prelude.map

mlookup :: Ord k => k -> Map k a -> Maybe a
mlookup = Data.Map.lookup

lnull :: [a] -> Bool
lnull = Prelude.null


type FreqMap = Map Word (Rank,Cat)
type Rank = Int   -- smaller is better; see the *Rank constants
type Cat = String
type Word = String
type Lin = String

-- Parse the fields of one frequency-list line. The first field is read
-- as the rank, the fourth is the word and the fifth its category; the
-- second and third fields are ignored. Returns a singleton list on
-- success and [] for any line that does not have this shape.
parseFreq :: [String] -> [(Word,(Rank,Cat))]
parseFreq ws = case ws of
  n:_:_:w:c:_ | [(rank,"")] <- reads n -> [(w,(rank,c))]
  _ -> []

-- Like parseFreq, but for callers that require an entry: a malformed
-- line is reported with the offending text rather than with an anonymous
-- pattern-match or "no parse" failure.
getFreq :: [String] -> (Word,(Rank,Cat))
getFreq ws = case parseFreq ws of
  entry:_ -> entry
  [] -> error ("getFreq: malformed frequency line: " ++ unwords ws)
|
||||
|
||||
|
||||
type MorphoMap = Map Word (Cat,Lin)

-- Build the morphological lexicon from the text of a GF dictionary file.
-- Each line is tokenized and passed to getMorpho; lines that yield no
-- entry contribute nothing to the map.
getMorphoMap :: String -> MorphoMap
getMorphoMap contents = fromList (concatMap (getMorpho . words) (lines contents))
|
||||
|
||||
-- Extract a lexicon entry from the tokens of one line of the form
--
--   lin WORD_CATx = RHS ... ;
--
-- and return [(WORD, (CAT, RHS))]: the function name is split at its
-- first underscore, the last character of the category suffix is dropped
-- (e.g. "NK" becomes "N"), and the final token of the line (the ";") is
-- dropped from the right-hand side. Any other line yields [].
--
-- Lines that start with "lin" but have no "_Cat" suffix or nothing after
-- the third token are also answered with [] -- previously they produced
-- an entry that crashed (init/tail on an empty list) when it was used.
-- (String is used in the signature; it is what Word, Cat and Lin stand for.)
getMorpho :: [String] -> [(String, (String, String))]
getMorpho ws = case ws of
  "lin":w:_:vs@(_:_)
    | (wd, '_':c@(_:_)) <- break (=='_') w -> [(wd, (init c, unwords (init vs)))]
  _ -> []
|
||||
|
||||
type TransDict = [(Word,(Cat,[(Word,(Rank,Lin))]))]

-- Build the translation dictionary for category cat from the text of a
-- wordnet link file. The file is split into stanzas (see stanzas); the
-- first token of a stanza's first line is the source word, and on every
-- line the tokens after the first form one translation candidate.
-- Candidates are ranked by getRank and reduced by sortTrans.
--
-- A stanza whose first line contains no tokens (whitespace only) is
-- skipped; it used to abort the run with a pattern-match failure.
getTransDict :: Cat -> FreqMap -> MorphoMap -> String -> TransDict
getTransDict cat freqs morpho contents =
  [ getOne w ls | ls@((w:_):_) <- lmap (lmap words) (stanzas (lines contents)) ]
  where
    getOne w ls = (w, (cat, sortTrans cat [getRank vs | _:vs <- ls]))

    -- Single-word candidate: prefer a word known to both the frequency
    -- list and the lexicon, then frequency list only (quoted string as
    -- linearization), then lexicon only, then a guess for anything that
    -- starts with a letter.
    getRank [v] = case (mlookup v freqs, mlookup v morpho) of
      (Just (i,c), Just (k,l)) | compatCat cat c && compatCat cat k -> (v, (i, lin l))
      (Just (i,c), _)          | compatCat cat c -> (v, (i, lin (quote v)))
      (_, Just (c,l))          | compatCat cat c -> (v, (morphoRank, lin l))
      -- Prelude.take: bare take is ambiguous with Data.Map.take
      _ | all isLetter (Prelude.take 1 v) -> (v, (guessRank, lin (quote v)))
      _ -> (v, (noRank, lin v))
    -- Multi-word (or empty) candidate: always gets the worst rank.
    getRank vs = (unwords vs, (compRank, lin (unwords vs)))

    lin l = "mk" ++ cat ++ " " ++ l

    quote s = "\"" ++ s ++ "\""
|
||||
|
||||
-- Order translation candidates by ascending rank and keep the best one.
-- sortBy is stable, so candidates of equal rank stay in source order.
-- The category argument is currently unused.
sortTrans :: Cat -> [(Word,(Rank,Lin))] -> [(Word,(Rank,Lin))]
sortTrans _cat = chooseBest . sortBy (\ (_,(r,_)) (_,(s,_)) -> compare r s)
  where
    -- Prelude.take: with Data.Map imported unqualified, bare take is
    -- ambiguous against Data.Map.take (containers >= 0.5.8).
    chooseBest = Prelude.take 1 ---- only the top candidate for now
|
||||
|
||||
-- Does a category c found in the frequency list or lexicon fit the
-- requested dictionary category cat? "V2" is satisfied by "V"; every
-- other category must match exactly.
-- (String is what the Cat alias stands for.)
compatCat :: String -> String -> Bool
compatCat cat c
  | cat == "V2" = c == "V"
  | otherwise   = c == cat
|
||||
|
||||
-- Fallback ranks for candidates that have no rank from the frequency
-- list. Smaller is better; mkLin comments out anything ranked noRank or
-- worse.

-- found in the lexicon only
morphoRank :: Int
morphoRank = 10000

-- in neither source, but starts with a letter
guessRank :: Int
guessRank = 20000

-- in neither source and does not start with a letter
noRank :: Int
noRank = 30000

-- multi-word candidate
compRank :: Int
compRank = 40000
|
||||
|
||||
-- Render one dictionary entry as a GF linearization rule
--
--   lin word_Cat = <lin> ;
--
-- The function name is the source word with every '-' replaced by '_',
-- followed by "_" and the category. If the first candidate is ranked
-- below noRank its linearization is used; otherwise the rule is emitted
-- commented out ("-- lin") with the candidate as a quoted string.
-- An entry without candidates is not handled.
mkLin :: (Word,(Cat,[(Word,(Rank,Lin))])) -> String
mkLin (word,(cat,ws)) = unwords [keyw, fun, "=", lin, ";"]
  where
    fun = [if ch == '-' then '_' else ch | ch <- word] ++ "_" ++ cat
    (keyw, lin) = case ws of
      (_,(r,l)):_ | r < noRank -> ("lin", l)
      (w,_):_ -> ("-- lin", "\"" ++ w ++ "\"") ---- look inside non-freq words
|
||||
|
||||
-- Split a list of lines into stanzas: maximal runs of non-empty lines.
-- Empty lines only separate stanzas and never appear in the result.
stanzas :: [String] -> [[String]]
stanzas ls = case dropWhile lnull ls of
  []   -> []
  rest -> let (stanza, remainder) = break lnull rest
          in  stanza : stanzas remainder
|
||||
@@ -0,0 +1,190 @@
|
||||
|
||||
FinnWordNetin sanastoon perustuvat HFST-transduktorit
|
||||
=====================================================
|
||||
|
||||
|
||||
Tämä paketti sisältää FinnWordNetin sanastodatan pohjalta luotuja
|
||||
HFST-transduktoreita, joita voi käyttää (taivuttavina) suomen tai
|
||||
englannin synonyymisanastoina tai käännössanakirjoina.
|
||||
|
||||
|
||||
FinnWordNet
|
||||
-----------
|
||||
|
||||
FinnWordNet on suomen wordnet. Se on luotu käännättämällä
|
||||
ammattikääntäjillä alkuperäisen englanninkielen Princeton WordNetin
|
||||
(PWN) version 3.0 sanat (sananmerkitykset) suomeksi ja yhdistämällä
|
||||
käännökset PWN:n rakenteeseen. FinnWordNet on osa
|
||||
FIN-CLARIN-infrastruktuurihanketta:
|
||||
|
||||
http://www.ling.helsinki.fi/finclarin/
|
||||
|
||||
Lisätietoja FinnWordNetistä saa FinnWordNet-projektin WWW-sivulta:
|
||||
|
||||
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/finnwordnet/
|
||||
|
||||
|
||||
HFST – Helsinki Finite-State Transducer Technology
|
||||
--------------------------------------------------
|
||||
|
||||
Lisätietoa HFST:stä (englanniksi) saa projektin WWW-sivulta:
|
||||
|
||||
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/hfst/
|
||||
|
||||
FinnWordNetin transduktorit ovat HFST:n optimized lookup -muodossa:
|
||||
|
||||
https://kitwiki.csc.fi/twiki/bin/view/KitWiki/HfstOptimizedLookupFormat
|
||||
|
||||
Transduktoritiedostoissa on pääte .hfstol. Niiden käyttäminen
|
||||
edellyttää joko HFST-kirjastoa ja -työkaluja (versiota 3.2.0 tai
|
||||
uudempaa) tai erillistä HFST optimized lookup -ohjelmaa, jolla niitä
|
||||
voi ajaa:
|
||||
|
||||
http://sourceforge.net/projects/hfst/files/optimized-lookup/
|
||||
|
||||
Transduktorit tarvitsevat optimized lookup -ohjelman version 1.3 (tai
|
||||
uudemman) tai Java-toteutuksen (hfst-ol.jar, 2011-05-23 tai uudempi);
|
||||
ne eivät toimi Python-toteutuksella (2011-05-24).
|
||||
|
||||
|
||||
FinnWordNetin transduktoripaketit
|
||||
---------------------------------
|
||||
|
||||
FinnWordNetin transduktorit on jaettu kolmeen pakettiin, joista
|
||||
jokaisessa on hiukan erilaiset transduktorit (YYYYMMDD viittaa paketin
|
||||
julkaisupäivään):
|
||||
|
||||
fiwnsyn-fi-YYYYMMDD.zip – Suomen synonyymisanastot
|
||||
|
||||
fiwnsyn-en-YYYYMMDD.zip - Englannin synonyymisanastot (perustuvat
|
||||
Princeton WordNetiin)
|
||||
|
||||
fiwntransl-YYYYMMDD.zip - Suomi–englanti ja englanti–suomi
|
||||
käännössanakirjat
|
||||
|
||||
Tämä LUEMINUT-tiedosto on yhteinen kaikille paketeille.
|
||||
|
||||
Synonyymisanastotransduktorien nimet ovat muotoa
|
||||
fiwnsyn-KL-TYYPPI.hfstol, missä KL on kielikoodi ”fi” tai ”en” ja
|
||||
TYYPPI on yksi seuraavista:
|
||||
|
||||
infl – Transduktori tunnistaa syötesanan taivutusmuodon ja tuottaa
|
||||
synonyymit samassa taivutusmuodossa. Transduktori ei tunnista
|
||||
eikä tuota monisanaisia synonyymeja. Sanaa ei lasketa itsensä
|
||||
synonyymiksi.
|
||||
|
||||
infl-refl – Sama kuin edellä, mutta synonymia on refleksiivistä:
|
||||
sana lasketaan itsensä synonyymiksi. Tämä mahdollistaa
|
||||
syötesanan mahdollisten vaihtoehtoisten taivutusmuotojen
|
||||
tuottamisen, kuten ”omenoiden”, ”omenoitten”, ”omenien”,
|
||||
”omenojen”, ”omenain”.
|
||||
|
||||
noinfl - Transduktori tunnistaa syötesanan taivutetuissa
|
||||
muodoissa, mutta tuottaa synonyymit perusmuodoissaan.
|
||||
Englannin transduktorit tunnistavat ja tuottavat myös
|
||||
monisanaiset ilmaukset, ja suomen transduktorit tuottavat ne.
|
||||
Sanaa ei lasketa itsensä synonyymiksi.
|
||||
|
||||
noinfl-refl – Sama kuin edellä, mutta synonymia on refleksiivistä.
|
||||
|
||||
Käännössanakirjatransduktoritiedostojen nimet ovat
|
||||
fiwntransl-fien.hfstol (suomi–englanti) ja fiwntransl-enfi.hfstol
|
||||
(englanti–suomi). Ne tunnistavat syötesanan taivutettuja muotoja,
|
||||
mutta tuottavat käännökset perusmuodossa. Englanti–suomi-sanakirja
|
||||
sekä tunnistaa että tuottaa monisanaisia ilmauksia, kun taas
|
||||
suomi–englanti-sanakirja vain tuottaa niitä.
|
||||
|
||||
|
||||
Lähteitä
|
||||
--------
|
||||
|
||||
FinnWordNetin ja Princeton WordNetin datan lisäksi transduktorien
|
||||
tekemisessä on käytetty Omorfia, suomen avointa morfologista työkalua
|
||||
(http://gna.org/projects/omorfi), ja HFST:n englannin morfologiaa
|
||||
(http://sourceforge.net/projects/hfst/files/morphological-transducers/hfst-english.tar.gz/download),
|
||||
joka on alun perin Måns Huldenin Princeton WordNetin datan pohjalta
|
||||
tekemä.
|
||||
|
||||
|
||||
Puutteita
|
||||
---------
|
||||
|
||||
* Monisanaisten ilmausten käsittely on osittain epäyhtenäistä.
|
||||
|
||||
* Suomen synonyymisanastot, erityisesti taivuttavat sanastot,
|
||||
tuottavat usein monia identtisiä tulostesanoja.
|
||||
|
||||
* Englannin taivuttava synonyymisanasto yligeneroi joitain
|
||||
sananmuotoja, kuten virheellisen kaksinkertaisen monikon genetiivin
|
||||
(”nets’s”) oikean (”nets’”) lisäksi.
|
||||
|
||||
* Ei-taivuttava englannin synonyymisanasto ja englanti–suomi-sanakirja
|
||||
tunnistavat taivutuksen monisanaisen ilmauksen viimeisessä sanassa,
|
||||
vaikka olisi oikein taivuttaa jotain aiempaa sanaa. Ne tunnistavat
|
||||
esimerkiksi ”arrive ated” oikean muodon ”arrived at” sijaan.
|
||||
|
||||
* Monitulkintaisen tai monimerkityksisen sananmuodon kaikki synonyymit
|
||||
tai käännökset luetellaan yhdessä, ilman järjestystä tai ryhmittelyä
|
||||
sanaluokan tai sananmerkityksen mukaan.
|
||||
|
||||
|
||||
Lisenssi
|
||||
--------
|
||||
|
||||
Koska FinnWordNet käyttää Princeton WordNetin rakennetta ja
|
||||
merkitysten selitteitä, se on PWN:n johdannainen ja siten PWN:n
|
||||
lisenssin alainen:
|
||||
|
||||
http://wordnet.princeton.edu/wordnet/license/
|
||||
|
||||
PWN:n lisenssi sallii vapaan käytön, myös kaupallisesti, kunhan sen
|
||||
käyttämisestä ja tekijänoikeuksista kerrotaan:
|
||||
|
||||
WordNet Release 3.0 This software and database is being provided
|
||||
to you, the LICENSEE, by Princeton University under the following
|
||||
license. By obtaining, using and/or copying this software and
|
||||
database, you agree that you have read, understood, and will
|
||||
comply with these terms and conditions.: Permission to use, copy,
|
||||
modify and distribute this software and database and its
|
||||
documentation for any purpose and without fee or royalty is hereby
|
||||
granted, provided that you agree to comply with the following
|
||||
copyright notice and statements, including the disclaimer, and
|
||||
that the same appear on ALL copies of the software, database and
|
||||
documentation, including modifications that you make for internal
|
||||
use or for distribution. WordNet 3.0 Copyright 2006 by Princeton
|
||||
University. All rights reserved. THIS SOFTWARE AND DATABASE IS
|
||||
PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS
|
||||
OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
|
||||
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR
|
||||
WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR
|
||||
PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR
|
||||
DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS,
|
||||
COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
|
||||
University or Princeton may not be used in advertising or
|
||||
publicity pertaining to distribution of the software and/or
|
||||
database. Title to copyright in this software, database and any
|
||||
associated documentation shall at all times remain with Princeton
|
||||
University and LICENSEE agrees to preserve same.
|
||||
|
||||
FinnWordNetin sisältämien sanojen suomenkielisten käännösten
|
||||
tekijänoikeudet ovat Helsingin yliopistolla. Ne lisensoidaan Creative
|
||||
Commons Nimeä (CC BY) 3.0 -lisenssillä, joka on samantapainen kuin
|
||||
PWN:n lisenssi:
|
||||
|
||||
http://creativecommons.org/licenses/by/3.0/deed.fi
|
||||
|
||||
Kun viittaat FinnWordNetiin, viittaa seuraavaan artikkeliin:
|
||||
|
||||
Krister Lindén and Lauri Carlson. 2010. FinnWordNet – WordNet på
|
||||
finska via översättning. LexicoNordica – Nordic Journal of
|
||||
Lexicography, 17:119–140.
|
||||
|
||||
|
||||
Yhteystiedot
|
||||
------------
|
||||
|
||||
FinnWordNet-projektia johtaa tutkimusjohtaja, FT Krister Lindén
|
||||
Helsingin yliopiston nykykielten laitoksessa (kieliteknologian
|
||||
oppiaineessa). Teknisissä kysymyksissä yhteyshenkilönä on
|
||||
projektitutkija Jyrki Niemi. Sähköpostiosoitteet ovat muotoa
|
||||
etunimi.sukunimi@helsinki.fi (aksentit poistettuina).
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,195 @@
|
||||
|
||||
HFST transducers based on FinnWordNet dictionary data
|
||||
=====================================================
|
||||
|
||||
|
||||
This package contains various HFST transducers based on FinnWordNet
|
||||
lexical data. The transducers can be used as (inflecting) Finnish or
|
||||
English thesauri or translation dictionaries.
|
||||
|
||||
|
||||
FinnWordNet
|
||||
-----------
|
||||
|
||||
FinnWordNet is a wordnet for Finnish. It was created by having
|
||||
professional translators translate the word senses of the Princeton
|
||||
WordNet (PWN) 3.0 into Finnish and by combining the translations with
|
||||
the PWN structure. FinnWordNet is a part of the FIN-CLARIN project:
|
||||
|
||||
http://www.ling.helsinki.fi/finclarin/
|
||||
|
||||
For more information about FinnWordNet, please visit the FinnWordNet
|
||||
project Web page
|
||||
|
||||
http://www.ling.helsinki.fi/en/lt/research/finnwordnet/
|
||||
|
||||
|
||||
HFST – Helsinki Finite-State Transducer Technology
|
||||
--------------------------------------------------
|
||||
|
||||
For more information about HFST, please see the project Web page
|
||||
|
||||
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/hfst/
|
||||
|
||||
The FinnWordNet transducers use the HFST optimized lookup format:
|
||||
|
||||
https://kitwiki.csc.fi/twiki/bin/view/KitWiki/HfstOptimizedLookupFormat
|
||||
|
||||
The transducer files have the suffix .hfstol. Using them requires
|
||||
either the HFST library and tools (version 3.2.0 or later) or the
|
||||
standalone HFST optimized lookup program with which they can be run
|
||||
(applied):
|
||||
|
||||
http://sourceforge.net/projects/hfst/files/optimized-lookup/
|
||||
|
||||
The transducers require version 1.3 or later of the standalone
|
||||
optimized lookup or the Java implementation (hfst-ol.jar as of
|
||||
2011-05-23); they do not work with the Python implementation of
|
||||
2011-05-24.
|
||||
|
||||
|
||||
FinnWordNet transducer packages
|
||||
-------------------------------
|
||||
|
||||
The FinnWordNet transducers are divided into three packages, each with
|
||||
a few slightly different transducers (YYYYMMDD denotes the release
|
||||
date of the package):
|
||||
|
||||
fiwnsyn-fi-YYYYMMDD.zip – Finnish thesauri
|
||||
|
||||
fiwnsyn-en-YYYYMMDD.zip - English thesauri (based on the Princeton
|
||||
WordNet)
|
||||
|
||||
fiwntransl-YYYYMMDD.zip - Finnish–English and English–Finnish
|
||||
translation dictionaries.
|
||||
|
||||
This README file is common to all the packages.
|
||||
|
||||
The names of the thesaurus transducer files have the form
|
||||
fiwnsyn-LG-TYPE.hfstol, where LG is the language code ‘fi’ or ‘en’ and
|
||||
TYPE is one of the following:
|
||||
|
||||
infl – The transducer recognizes inflected forms of the input word
|
||||
and generates synonyms with the same form. Multi-word synonyms
|
||||
are neither recognized nor generated. A word is not considered its
|
||||
own synonym.
|
||||
|
||||
infl-refl – The same as above but synonymy is reflexive: a word is
|
||||
considered its own synonym. This makes it possible to generate
|
||||
alternative forms of the input word, such as ‘indices’ and
|
||||
‘indexes’.
|
||||
|
||||
noinfl - The transducer recognizes inflected forms of the input
|
||||
word but generates synonyms in their base form. Multi-word
|
||||
synonyms are recognized and generated for English and
|
||||
generated for Finnish. A word is not considered its own
|
||||
synonym.
|
||||
|
||||
noinfl-refl – The same as above but synonymy is reflexive.
|
||||
|
||||
The names of the translation dictionary transducer files are
|
||||
fiwntransl-fien.hfstol for the Finnish–English dictionary and
|
||||
fiwntransl-enfi.hfstol for the English–Finnish one. They recognize
|
||||
inflected forms of the input word but generate the base form of the
|
||||
translation. The English–Finnish dictionary recognizes and generates
|
||||
multi-word translations, whereas the Finnish–English one only
|
||||
generates them.
|
||||
|
||||
|
||||
Sources
|
||||
-------
|
||||
|
||||
In addition to the FinnWordNet and Princeton WordNet data, the
|
||||
transducers have been constructed using the Omorfi open morphology
|
||||
tool for Finnish (http://gna.org/projects/omorfi) and the HFST English
|
||||
morphology
|
||||
(http://sourceforge.net/projects/hfst/files/morphological-transducers/hfst-english.tar.gz/download),
|
||||
originally by Måns Hulden, based on Princeton WordNet data.
|
||||
|
||||
|
||||
Deficiencies
|
||||
------------
|
||||
|
||||
* Multi-word expressions are handled somewhat inconsistently.
|
||||
|
||||
* The Finnish thesauri, in particular the inflecting ones, often
|
||||
generate many identical output words.
|
||||
|
||||
* The inflecting English thesaurus overgenerates some word forms, such
|
||||
as an incorrect double plural genitive (‘nets’s’) in addition to the
|
||||
correct one (‘nets’’).
|
||||
|
||||
* The non-inflecting English thesauri and the English–Finnish
|
||||
dictionary recognize inflection in the last word of a multi-word
|
||||
expression, even if it would be correct to inflect a preceding word.
|
||||
For example, they recognize ‘arrive ated’ but not the correct
|
||||
‘arrived at’.
|
||||
|
||||
* All the synonyms or translations of an ambiguous or polysemous word
|
||||
form are listed together, without any sorting or grouping according
|
||||
to the part of speech or word sense.
|
||||
|
||||
|
||||
Licence
|
||||
-------
|
||||
|
||||
Since FinnWordNet retains the structure and glosses of Princeton
|
||||
WordNet, it is a derivative of PWN subject to the PWN licence:
|
||||
|
||||
http://wordnet.princeton.edu/wordnet/license/
|
||||
|
||||
The PWN licence allows free use, including commercial use, provided
|
||||
that a copyright notice is given:
|
||||
|
||||
WordNet Release 3.0 This software and database is being provided
|
||||
to you, the LICENSEE, by Princeton University under the following
|
||||
license. By obtaining, using and/or copying this software and
|
||||
database, you agree that you have read, understood, and will
|
||||
comply with these terms and conditions.: Permission to use, copy,
|
||||
modify and distribute this software and database and its
|
||||
documentation for any purpose and without fee or royalty is hereby
|
||||
granted, provided that you agree to comply with the following
|
||||
copyright notice and statements, including the disclaimer, and
|
||||
that the same appear on ALL copies of the software, database and
|
||||
documentation, including modifications that you make for internal
|
||||
use or for distribution. WordNet 3.0 Copyright 2006 by Princeton
|
||||
University. All rights reserved. THIS SOFTWARE AND DATABASE IS
|
||||
PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS
|
||||
OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
|
||||
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR
|
||||
WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR
|
||||
PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR
|
||||
DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS,
|
||||
COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
|
||||
University or Princeton may not be used in advertising or
|
||||
publicity pertaining to distribution of the software and/or
|
||||
database. Title to copyright in this software, database and any
|
||||
associated documentation shall at all times remain with Princeton
|
||||
University and LICENSEE agrees to preserve same.
|
||||
|
||||
The translations of FinnWordNet are copyright of the University of
|
||||
Helsinki and they are licenced under Creative Commons Attribution (CC
|
||||
BY) 3.0, which is similar to the PWN licence:
|
||||
|
||||
http://creativecommons.org/licenses/by/3.0/
|
||||
|
||||
Please cite the following paper when referring to FinnWordNet:
|
||||
|
||||
Krister Lindén and Lauri Carlson. 2010. FinnWordNet – WordNet på
|
||||
finska via översättning. LexicoNordica – Nordic Journal of
|
||||
Lexicography, 17:119–140.
|
||||
|
||||
HFST is licenced under the GNU Lesser General Public License, version
|
||||
3.0:
|
||||
|
||||
http://www.gnu.org/licenses/lgpl.html
|
||||
|
||||
|
||||
Contact
|
||||
-------
|
||||
|
||||
The FinnWordNet project is led by Dr Krister Lindén at the Department
|
||||
of Modern Languages (Language Technology) of the University of
|
||||
Helsinki. In technical questions, please contact Mr Jyrki Niemi. Email
|
||||
addresses are of the form firstname.lastname@helsinki.fi (accents
|
||||
removed).
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user