mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-28 01:18:57 -06:00
extraction of all major Ice categories from Wiktionary; however, some manual post-editing is still needed in the generated files
This commit is contained in:
@@ -2,6 +2,7 @@ import qualified Data.Map
|
|||||||
import qualified Data.Text.IO
|
import qualified Data.Text.IO
|
||||||
import Data.Char
|
import Data.Char
|
||||||
import Data.List
|
import Data.List
|
||||||
|
import System.Directory
|
||||||
|
|
||||||
-- AR 2019-08-06
|
-- AR 2019-08-06
|
||||||
-- checking IrregIce wrt Wikipedia:
|
-- checking IrregIce wrt Wikipedia:
|
||||||
@@ -144,6 +145,11 @@ jumpToIcelandic ls = dropWhile (\l -> not (isPrefixOf "<h2>" l && isPrefixOf "Ic
|
|||||||
-- to be run in wiktionary/, with subdirs nouns/ adjectives/ verbs/
|
-- to be run in wiktionary/, with subdirs nouns/ adjectives/ verbs/
|
||||||
|
|
||||||
-- to be run in adjectives/
|
-- to be run in adjectives/
|
||||||
|
getAllWiktNouns = do
|
||||||
|
vs <- readFile "nouns/wikt-nouns.txt" >>= return . lines
|
||||||
|
writeFile "n.tmp" ""
|
||||||
|
mapM_ (\v -> getWiktNoun "nouns/" v >>= appendFile "n.tmp" . unlines . emitGF) vs
|
||||||
|
|
||||||
getAllWiktAdjectives = do
|
getAllWiktAdjectives = do
|
||||||
vs <- readFile "adjectives/wikt-adjectives.txt" >>= return . lines
|
vs <- readFile "adjectives/wikt-adjectives.txt" >>= return . lines
|
||||||
writeFile "a.tmp" ""
|
writeFile "a.tmp" ""
|
||||||
@@ -159,7 +165,12 @@ getAllWiktVerbs = do
|
|||||||
-- return ([relevant Wikt lines], (fun,cat,lin), message)
|
-- return ([relevant Wikt lines], (fun,cat,lin), message)
|
||||||
getWiktWord :: Int -> (String -> [String] -> ([String],((String,String,String),Message))) -> FilePath -> FilePath -> IO ([String],((String,String,String),Message))
|
getWiktWord :: Int -> (String -> [String] -> ([String],((String,String,String),Message))) -> FilePath -> FilePath -> IO ([String],((String,String,String),Message))
|
||||||
getWiktWord number check dir file = do
|
getWiktWord number check dir file = do
|
||||||
s <- readFile (dir++file) >>= return . map untag . take number . getTD . jumpToIcelandic . lines
|
let dirfile = dir++file
|
||||||
|
ex <- doesFileExist dirfile
|
||||||
|
if not ex
|
||||||
|
then return ([],(noGF,MBad (file ++ " does not exist")))
|
||||||
|
else do
|
||||||
|
s <- readFile dirfile >>= return . map untag . take number . getTD . jumpToIcelandic . lines
|
||||||
return $ check file s
|
return $ check file s
|
||||||
|
|
||||||
getWiktNoun = getWiktWord 17 checkNoun
|
getWiktNoun = getWiktWord 17 checkNoun
|
||||||
@@ -170,23 +181,26 @@ noCheck :: String -> [String] -> ([String],((String,String,String),Message))
|
|||||||
noCheck s ss = (ss, (noGF, MMissing s))
|
noCheck s ss = (ss, (noGF, MMissing s))
|
||||||
noGF = ("--","--","--")
|
noGF = ("--","--","--")
|
||||||
|
|
||||||
checkNoun noun forms = noCheck noun forms ----
|
checkNoun noun forms = case length forms of
|
||||||
|
---- n | n < 24 -> (forms, (noGF, MBad (adj ++ " A: only " ++ show (length forms) ++ " lines")))
|
||||||
|
n | n < 17 -> (forms, (noGF, MBad (noun ++ " N: only " ++ show (length forms) ++ " lines")))
|
||||||
|
_ -> (forms, checkZZ noun (noun ++ "_N", "N", "mkgN " ++ gender (forms!!0), [forms!!i | i <- [1,5,9,13,3,7,11,15]]))
|
||||||
|
where
|
||||||
|
gender s = case take 1 s of
|
||||||
|
"m" -> "masculine"
|
||||||
|
"f" -> "feminine"
|
||||||
|
_ -> "neuter" --- "n"
|
||||||
|
|
||||||
|
|
||||||
checkAdjective adj forms = case length forms of
|
checkAdjective adj forms = case length forms of
|
||||||
n | n < 24 -> (forms, (noGF, MBad (adj ++ " A: only " ++ show (length forms) ++ " lines")))
|
n | n < 24 -> (forms, (noGF, MBad (adj ++ " A: only " ++ show (length forms) ++ " lines")))
|
||||||
n | n < 120 -> (forms, checkZZ adj (adj ++ "_A", "A", "mkA", [forms!!0, forms!!1]))
|
n | n < 120 -> (forms, checkZZ adj (adj ++ "_A", "A", "mkA", [forms!!0, forms!!1]))
|
||||||
_ -> (forms, checkZZ adj (adj ++ "_A", "A", "mkA", [forms!!0, forms!!1,forms!!48]))
|
_ -> (forms, checkZZ adj (adj ++ "_A", "A", "mkA", [forms!!0, forms!!1,forms!!48]))
|
||||||
|
|
||||||
checkZZ w (fun,cat,lin,args) =
|
checkVerb verb forms = case length forms of
|
||||||
if elem "ZZ" (map (wform . words) args)
|
n | n < 75 -> (forms, (noGF, MBad (verb ++ " V: only " ++ show (length forms) ++ " lines")))
|
||||||
then (noGF, MBad (w ++ " " ++ cat ++ ": missing forms in data"))
|
_ -> case unexpectedWikLines forms of
|
||||||
else ((fun,cat, app lin args),MGood w)
|
[] -> (forms, checkZZ verb (verb ++ "_V", "V", "mkV", [verb, forms!!5, forms!!18, forms!!74, forms!!1]))
|
||||||
|
|
||||||
checkVerb verb forms =
|
|
||||||
if length forms < 75
|
|
||||||
then (forms, (noGF, MBad (verb ++ " V: only " ++ show (length forms) ++ " lines")))
|
|
||||||
else case unexpectedWikLines forms of
|
|
||||||
[] -> (forms, ((verb ++ "_V", "V", app "mkV" [verb, forms!!5, forms!!18, forms!!74, forms!!1]), MGood verb)) ----
|
|
||||||
us -> (forms, (noGF, MBad (verb ++ " V: unexpected lines " ++ show (length us))))
|
us -> (forms, (noGF, MBad (verb ++ " V: unexpected lines " ++ show (length us))))
|
||||||
|
|
||||||
data Message =
|
data Message =
|
||||||
@@ -198,6 +212,11 @@ data Message =
|
|||||||
app f xs = unwords $ f : map (quote . wform . words) xs
|
app f xs = unwords $ f : map (quote . wform . words) xs
|
||||||
quote s = "\"" ++ s ++ "\""
|
quote s = "\"" ++ s ++ "\""
|
||||||
|
|
||||||
|
checkZZ w (fun,cat,lin,args) =
|
||||||
|
if elem "ZZ" (map (wform . words) args)
|
||||||
|
then (noGF, MBad (w ++ " " ++ cat ++ ": missing forms in data"))
|
||||||
|
else ((fun,cat, app lin args),MGood w)
|
||||||
|
|
||||||
emitGF (ss,((fun,cat,lin),msg)) = case msg of
|
emitGF (ss,((fun,cat,lin),msg)) = case msg of
|
||||||
MGood _ -> [unwords ["fun",fun,":",cat,";"],unwords ["lin",fun,"=",lin,";"]]
|
MGood _ -> [unwords ["fun",fun,":",cat,";"],unwords ["lin",fun,"=",lin,";"]]
|
||||||
_ -> ["-- " ++ show msg]
|
_ -> ["-- " ++ show msg]
|
||||||
|
|||||||
Reference in New Issue
Block a user