PGF Service: a bit more clever lexer=text

Only change the first word to lowercase if the original input is not found in
the grammar's morphology. This allows parsing of sentences starting with "I" in
English, nouns in German and proper names in other languages, but it can make
the wrong choice for multi-word expressions.
This commit is contained in:
hallgren
2014-04-09 14:13:18 +00:00
parent 4479bb81b7
commit 04a6260eea
2 changed files with 82 additions and 35 deletions

View File

@@ -2,8 +2,13 @@ module PGF.Lexing where
import Data.Char(isSpace,toLower,toUpper)
-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
lexText :: String -> [String]
lexText = uncap . lext where
lexText = lexText' uncapitInit
-- | Text lexing with custom treatment of the first word of every sentence.
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
lext s = case s of
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
c:cs | isMinorPunct c -> [c] : lext cs
@@ -11,7 +16,7 @@ lexText = uncap . lext where
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
_ -> [s]
uncap s = case s of
(c:cs):ws -> (toLower c : cs):ws
w:ws -> uncap1 w:ws
_ -> s
unlexText :: [String] -> String
@@ -78,6 +83,11 @@ capitInit s = case s of
c:cs -> toUpper c : cs
_ -> s
-- | Lower-case the first character of a string, leaving the remaining
-- characters untouched. The empty string is returned unchanged.
uncapitInit :: String -> String
uncapitInit (c:cs) = toLower c : cs
uncapitInit []     = []
-- | Unquote each string wrapped in double quotes
unquote = map unq where
unq s = case s of