hangul syllables

This commit is contained in:
2026-02-12 09:45:52 -07:00
parent 0ba6518630
commit fd36692419
8 changed files with 197 additions and 81 deletions

12
scripts/search-hangul Executable file
View File

@@ -0,0 +1,12 @@
#!/usr/bin/env bb
(require '[babashka.fs :as fs]
'[clojure.string :as str])
;; Scan every .gf file matched by the glob under lab1/grammar/korean and
;; report each line containing code points from the Hangul Jamo block and/or
;; the Hangul Syllables block. A line containing both is reported twice, once
;; per category.
;;
;; Output format: "<KIND>: <file>:<line>: <text>", where <line> is 1-based so
;; it matches what editors and grep-style tools display.
(doseq [f (fs/glob "lab1/grammar/korean" "**.gf")
        ;; map-indexed counts from 0; shift to conventional 1-based numbering.
        [line-number line] (map-indexed (fn [i x] [(inc i) x])
                                        (-> f str slurp str/split-lines))]
  (when (re-find #"\p{block=HangulJamo}" line)
    (printf "JAMO: %s:%d: %s\n" (str f) line-number line))
  (when (re-find #"\p{block=HangulSyllables}" line)
    (printf "SYLLABLE: %s:%d: %s\n" (str f) line-number line)))
;; printf does not flush on its own; make sure all findings reach stdout.
(flush)

44
scripts/syllables-to-jamo Executable file
View File

@@ -0,0 +1,44 @@
#!/usr/bin/env bb
(require '[babashka.fs :as fs]
'[clojure.string :as str])
(defn syllables->jamo
  "Convert precomposed Hangul syllables in string S to their jamo components.

  Every char of S in the range U+AC00..U+D7A3 is replaced by its leading
  consonant (U+1100 + index), its vowel (U+1161 + index) and, when the
  syllable has one, its trailing consonant (U+11A7 + index). All other
  chars are copied through unchanged. Returns a new string."
  [s]
  (->> s
       (map int)
       (mapcat
        (fn [c]
          (if (<= 0xAC00 c 0xD7A3)
            ;; Hangul syllable - decompose.
            ;; Offset from U+AC00 is (lead * 21 + medial) * 28 + final.
            (let [code        (- c 0xAC00)
                  lead        (quot code (* 21 28))
                  medial      (quot (mod code (* 21 28)) 28)
                  final       (mod code 28)
                  lead-jamo   (+ 0x1100 lead)
                  medial-jamo (+ 0x1161 medial)]
              ;; final index 0 means "no trailing consonant", so nothing is
              ;; emitted for it; indices 1..27 map to U+11A8..U+11C2.
              (cond-> [lead-jamo medial-jamo]
                (pos? final) (conj (+ 0x11A7 final))))
            ;; Not a Hangul syllable
            [c])))
       (map char)
       (apply str)))
(defn -main
  "Entry point. With file arguments, rewrite each named file in place with
  its Hangul syllables decomposed to jamo. With no arguments, act as a
  filter: read lines from standard input until it is exhausted and print
  each converted line to standard output."
  [& args]
  (let [decompose (fn [text]
                    (str/replace text
                                 #"\p{block=HangulSyllables}+"
                                 syllables->jamo))]
    (if (empty? args)
      ;; Filter mode: convert and echo one line at a time.
      (loop []
        (when-some [line (read-line)]
          (println (decompose line))
          (recur)))
      ;; File mode: read the whole file, convert, and write it back.
      (doseq [path args]
        (spit path (decompose (slurp path)))))))
;; Run -main only when the "babashka.file" system property names this very
;; file, i.e. not when the file is loaded from somewhere else.
(let [invoked-script (System/getProperty "babashka.file")]
  (when (= *file* invoked-script)
    (apply -main *command-line-args*)))