Files
net-deertopia/src/net/deertopia/doerg/element.clj
Madeleine Sydney Ślaga 6e9531f944
Some checks failed
build / build (push) Failing after 36s
refactor: doerg는 publisher와 결합
2026-04-03 13:21:00 -06:00

304 lines
10 KiB
Clojure
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
(ns net.deertopia.doerg.element
(:refer-clojure :exclude [read-string type])
(:require [babashka.fs :as fs]
[babashka.process :as p]
[cheshire.core :as json]
[clojure.core.match :refer [match]]
[net.deertopia.doerg :as-alias doerg]
[clojure.java.io :as io]
[clojure.set :as set]
[clojure.string :as str]
[clojure.test.check.generators :as gen]
[clojure.tools.logging.readable :as lr]
[clojure.zip :as z]
[com.rpl.specter :as sp]
[com.rpl.specter.zipper :as sz]
[net.deertopia.doerg.common :as common]
[net.deertopia.doerg.config :as cfg]
[clojure.tools.logging :as l])
(:import
(java.util UUID)))
(def ^:dynamic *uniorg-timeout-duration*
"Number of milliseconds to wait before killing the external Uniorg
process."
(* 10 1000))
(defn- camel->kebab [s]
(->> (str/split s #"(?<=[a-z])(?=[A-Z])")
(map str/lower-case)
(str/join "-")))
(defn uniorg [& {:keys [in]
:or {in *in*}}]
(let [r (-> (p/process
{:in in :out :string}
(-> cfg/*cfg* ::doerg/doerg-parser str))
(common/deref-with-timeout *uniorg-timeout-duration*))]
(when (zero? (:exit r))
(-> r :out (json/parse-string (comp keyword camel->kebab))))))
(declare gather-first-section gather-latex-paragraphs element-types)
(defn read-string
[s & {:keys [post-processors]
:or {post-processors [gather-first-section
gather-latex-paragraphs]}}]
(let [apply-post-processors (apply comp (reverse post-processors))]
(with-in-str s
(-> (uniorg :in *in*)
apply-post-processors))))
(defn greater-element?
"Return truthy if `e` is a greater org-element; i.e. one that can
have children."
[e]
;; Not 100% sure if this is a valid definition. It seems that
;; Uniorg sets `:children` to an empty vector when a great element
;; lacks children.
(and (map? e) (contains? e :children)))
(defn org-element? [element]
(and (map? element)
(contains? element :type)))
(defn type [element]
(:type element))
(defn of-type?
"Return truthy if the Org node `element` is of type `type`. In the
vararg case, return truthy if `element` is of any of the types
listed."
([element type]
(= (:type element) type))
([element type & types]
(contains? (into #{} (cons type types))
(:type element))))
(defn of-keyword-type? [element key]
(and (of-type? element "keyword")
(= (:key element) key)))
(defn footnotes-section? [element]
(and (of-type? element "section")
(when-some [footnotes-headline (first (:children element))]
(= "Footnotes" (:raw-value footnotes-headline)))))
(defn display-math?
"Return truthy if `element` should be considered display math."
[element]
(or (of-type? element "latex-environment")
(and (of-type? element "latex-fragment")
(-> element :contents (str/starts-with? "\\[")))))
;;; Zipper
(defn doerg-zip [document]
(clojure.zip/zipper greater-element?
:children
#(assoc %1 :children %2)
document))
(def children-walker
"Walk each child of an Org element."
[:children sp/ALL])
(def postorder-walker
"Recursively walk each node of an Org element in post-order."
(sp/recursive-path
[] p
(sp/if-path greater-element?
(sp/continue-then-stay children-walker p)
sp/STAY)))
(def postorder-walker*
(sp/recursive-path
[] p
(sp/if-path greater-element?
(sp/continue-then-stay :children p)
sp/STAY)))
;;; Post-processing
(def property-handlers
"A map of node-property keys to functions. The functions will be
called with two arguments: the pre-existing top-level document data
(a map), and the node-property value. The function is expected to
return the document data map with the new property merged in."
{"ID" (fn [data id]
(let [new-id (UUID/fromString id)]
(when (contains? data :id)
(lr/warnf (str "Found multiple :ID: definitions."
" Replacing %s with %s.")
(:id data) new-id))
(assoc data :id new-id)))
"DeertopiaVisibility"
(fn [data visibility]
(let [v (case visibility
"public" :public
"private" :private
"graphonly" :graph-only
(do (lr/warn "Unknown visibility: %s" visibility)
:private))]
(assoc data :net.deertopia/visibility v)))})
(def keyword-handlers
"Like `property-handlers`, but for top-level keywords."
{"TITLE" #(assoc %1 :title %2)})
(defn- apply-handlers [handlers values]
(reduce (fn [data {:keys [key value]}]
(let [f (get handlers key)]
(f data value)))
{}
values))
(defn handle-properties [doc]
(let [props (some-> doc :children first)]
(when (of-type? props "property-drawer")
(->> props
(sp/select [children-walker
#(contains? property-handlers (:key %))])
(apply-handlers property-handlers)))))
(defn handle-keywords [doc]
(->> doc
(sp/select [children-walker
#(and (of-type? % "keyword")
(contains? keyword-handlers (:key %)))])
(apply-handlers keyword-handlers)))
(defn gather-doerg-data [doc]
(assoc doc :net.deertopia.doerg/data
(merge (handle-properties doc)
(handle-keywords doc))))
(defn- split-sections
"Given a list of top-level nodes as spat out by the `uniorg`
parser, return a map with the following keys
• :top-level-nodes The nodes that /should/ be at the top-level.
• :first-section-nodes The nodes that should be wrapped in a new
section node.
• :rest Everything else."
[nodes]
(let [[of-top-level remaining-nodes]
(->> nodes (split-with #(of-type? % "property-drawer" "keyword")))
[of-first-section remaining-nodes*]
(->> remaining-nodes (split-with #(not (of-type? % "section"))))]
{:top-level-nodes of-top-level
:first-section-nodes of-first-section
:rest remaining-nodes*}))
(defn- element-bounds [& nodes]
(reduce (fn [acc {:keys [contents-begin contents-end]}]
(if (and (nat-int? contents-begin)
(nat-int? contents-end))
(-> acc
(update
:contents-begin
#(min (or % Integer/MAX_VALUE) contents-begin))
(update
:contents-end
#(max (or % Integer/MIN_VALUE) contents-end)))
acc))
{:contents-begin nil
:contents-end nil}
nodes))
(defn gather-first-section [node]
(assert (of-type? node "org-data")
"`gather-doerg-data` should be applied to the document root.")
(let [{:keys [top-level-nodes first-section-nodes rest]}
(split-sections (:children node))
;; TODO: Construct `:contents-begin` and `:contents-end` data
;; by spanning the children.
first-section (merge {:type "section"
:children (vec first-section-nodes)}
(apply element-bounds first-section-nodes))
new-children (vec (concat top-level-nodes
(list first-section)
rest))]
(assoc node :children new-children)))
(defn- newline-final-paragraph?
"Is `e` a paragraph, and does it end with a newline?"
[e]
(and (of-type? e "paragraph")
(some-> (-> e :position :end :column)
(= 1))))
(defn consequtive-elements?
"Returh truthy if each successive pair of elements is NOT separated
by at least one explicit paragraph break; i.e. a blank line."
[& elements]
(match elements
([(e :guard newline-final-paragraph?) e & es] :seq)
(and (= (-> e :position :end :line)
(-> e :position :start :line))
(recur es))
([e e & es] :seq)
(and (= (-> e :position :end :line inc)
(-> e :position :start :line))
(recur es))
([_] :seq) true
([] :seq) true))
(defn swallow
([predator prey]
(assert (greater-element? predator))
(-> predator
(update :children #(conj % prey))
(assoc-in [:position :end] (-> prey :position :end))))
([predator prey & more-prey]
(reduce swallow predator (cons prey more-prey))))
(defn- paragraph-followed-by-tex? [children]
(match children
[(para :guard #(of-type? % "paragraph"))
(tex :guard #(of-type? % "latex-environment"))
& _]
(consequtive-elements? para tex)
:else false))
(defn- paragraph-followed-by-paragraph? [children]
(match children
[(para :guard #(of-type? % "paragraph"))
(para :guard #(of-type? % "paragraph"))
& _]
(consequtive-elements? para para)
:else false))
(defn gather-latex-paragraphs [node]
(->> node
(sp/transform
[postorder-walker (sp/must :children)]
(fn [children]
(loop [acc []
cs (vec children)]
(match cs
;; CASE: A paragraph followed by a LaTeX environment.
;; If there are no blank lines separating the paragraph
;; from the LaTeX environment, the LaTeX environment
;; shall become a child of the paragraph.
([para tex & rest] :guard paragraph-followed-by-tex?)
(recur acc (vec (cons (swallow para tex) rest)))
;; CASE: Similar to the paragraph-followed-by-tex case,
;; but instead of swallowing the entire second element,
;; we swallow the /children/ of the second element,
;; since paragraphs cannot be nested.
([para para & rest]
:guard paragraph-followed-by-paragraph?)
(recur acc (vec (cons (apply swallow para (:children para))
rest)))
;; CASE: Irrelevant or empty!
[c & rest]
(recur (conj acc c) rest)
[] acc))))))