From 4873e78db86d5f6271bfb9e0e719cf50575035b9 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Thu, 8 Aug 2019 15:15:54 +0200 Subject: [PATCH] extraction of all major Ice categories from Wiktionary; however, some manual post-editing still needed in the generated files --- .../wiktionary/CheckWithWiktionary.hs | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/icelandic/wiktionary/CheckWithWiktionary.hs b/src/icelandic/wiktionary/CheckWithWiktionary.hs index b5098d4e6..278af2b0d 100644 --- a/src/icelandic/wiktionary/CheckWithWiktionary.hs +++ b/src/icelandic/wiktionary/CheckWithWiktionary.hs @@ -2,6 +2,7 @@ import qualified Data.Map import qualified Data.Text.IO import Data.Char import Data.List +import System.Directory -- AR 2019-08-06 -- checking IrregIce wrt Wikipedia: @@ -144,6 +145,11 @@ jumpToIcelandic ls = dropWhile (\l -> not (isPrefixOf "

" l && isPrefixOf "Ic -- to be run in wiktionary/, with subdirs nouns/ adjectives/ verbs/ -- to be run in adjectives/ +getAllWiktNouns = do + vs <- readFile "nouns/wikt-nouns.txt" >>= return . lines + writeFile "n.tmp" "" + mapM_ (\v -> getWiktNoun "nouns/" v >>= appendFile "n.tmp" . unlines . emitGF) vs + getAllWiktAdjectives = do vs <- readFile "adjectives/wikt-adjectives.txt" >>= return . lines writeFile "a.tmp" "" @@ -159,8 +165,13 @@ getAllWiktVerbs = do -- return ([relevant Wikt lines], (fun,cat,lin), message) getWiktWord :: Int -> (String -> [String] -> ([String],((String,String,String),Message))) -> FilePath -> FilePath -> IO ([String],((String,String,String),Message)) getWiktWord number check dir file = do - s <- readFile (dir++file) >>= return . map untag . take number . getTD . jumpToIcelandic . lines - return $ check file s + let dirfile = dir++file + ex <- doesFileExist dirfile + if not ex + then return ([],(noGF,MBad (file ++ " does not exist"))) + else do + s <- readFile dirfile >>= return . map untag . take number . getTD . jumpToIcelandic . lines + return $ check file s getWiktNoun = getWiktWord 17 checkNoun getWiktAdjective = getWiktWord 120 checkAdjective @@ -170,23 +181,26 @@ noCheck :: String -> [String] -> ([String],((String,String,String),Message)) noCheck s ss = (ss, (noGF, MMissing s)) noGF = ("--","--","--") -checkNoun noun forms = noCheck noun forms ---- +checkNoun noun forms = case length forms of +---- n | n < 24 -> (forms, (noGF, MBad (adj ++ " A: only " ++ show (length forms) ++ " lines"))) + n | n < 17 -> (forms, (noGF, MBad (noun ++ " N: only " ++ show (length forms) ++ " lines"))) + _ -> (forms, checkZZ noun (noun ++ "_N", "N", "mkgN " ++ gender (forms!!0), [forms!!i | i <- [1,5,9,13,3,7,11,15]])) + where + gender s = case take 1 s of + "m" -> "masculine" + "f" -> "feminine" + _ -> "neuter" --- "n" + checkAdjective adj forms = case length forms of n | n < 24 -> (forms, (noGF, MBad (adj ++ " A: only " ++ show (length forms) ++ " lines"))) n | n < 120 -> (forms, checkZZ adj (adj ++ "_A", "A", "mkA", [forms!!0, forms!!1])) _ -> (forms, checkZZ adj (adj ++ "_A", "A", "mkA", [forms!!0, forms!!1,forms!!48])) -checkZZ w (fun,cat,lin,args) = - if elem "ZZ" (map (wform . words) args) - then (noGF, MBad (w ++ " " ++ cat ++ ": missing forms in data")) - else ((fun,cat, app lin args),MGood w) - -checkVerb verb forms = - if length forms < 75 - then (forms, (noGF, MBad (verb ++ " V: only " ++ show (length forms) ++ " lines"))) - else case unexpectedWikLines forms of - [] -> (forms, ((verb ++ "_V", "V", app "mkV" [verb, forms!!5, forms!!18, forms!!74, forms!!1]), MGood verb)) ---- +checkVerb verb forms = case length forms of + n | n < 75 -> (forms, (noGF, MBad (verb ++ " V: only " ++ show (length forms) ++ " lines"))) + _ -> case unexpectedWikLines forms of + [] -> (forms, checkZZ verb (verb ++ "_V", "V", "mkV", [verb, forms!!5, forms!!18, forms!!74, forms!!1])) us -> (forms, (noGF, MBad (verb ++ " V: unexpected lines " ++ show (length us)))) data Message = @@ -198,6 +212,11 @@ data Message = app f xs = unwords $ f : map (quote . wform . words) xs quote s = "\"" ++ s ++ "\"" +checkZZ w (fun,cat,lin,args) = + if elem "ZZ" (map (wform . words) args) + then (noGF, MBad (w ++ " " ++ cat ++ ": missing forms in data")) + else ((fun,cat, app lin args),MGood w) + emitGF (ss,((fun,cat,lin),msg)) = case msg of MGood _ -> [unwords ["fun",fun,":",cat,";"],unwords ["lin",fun,"=",lin,";"]] _ -> ["-- " ++ show msg]