This commit is contained in:
303
src/net/deertopia/doerg/element.clj
Normal file
303
src/net/deertopia/doerg/element.clj
Normal file
@@ -0,0 +1,303 @@
|
||||
(ns net.deertopia.doerg.element
|
||||
(:refer-clojure :exclude [read-string type])
|
||||
(:require [babashka.fs :as fs]
|
||||
[babashka.process :as p]
|
||||
[cheshire.core :as json]
|
||||
[clojure.core.match :refer [match]]
|
||||
[net.deertopia.doerg :as-alias doerg]
|
||||
[clojure.java.io :as io]
|
||||
[clojure.set :as set]
|
||||
[clojure.string :as str]
|
||||
[clojure.test.check.generators :as gen]
|
||||
[clojure.tools.logging.readable :as lr]
|
||||
[clojure.zip :as z]
|
||||
[com.rpl.specter :as sp]
|
||||
[com.rpl.specter.zipper :as sz]
|
||||
[net.deertopia.doerg.common :as common]
|
||||
[net.deertopia.doerg.config :as cfg]
|
||||
[clojure.tools.logging :as l])
|
||||
(:import
|
||||
(java.util UUID)))
|
||||
|
||||
|
||||
|
||||
(def ^:dynamic *uniorg-timeout-duration*
|
||||
"Number of milliseconds to wait before killing the external Uniorg
|
||||
process."
|
||||
(* 10 1000))
|
||||
|
||||
(defn- camel->kebab [s]
|
||||
(->> (str/split s #"(?<=[a-z])(?=[A-Z])")
|
||||
(map str/lower-case)
|
||||
(str/join "-")))
|
||||
|
||||
(defn uniorg [& {:keys [in]
|
||||
:or {in *in*}}]
|
||||
(let [r (-> (p/process
|
||||
{:in in :out :string}
|
||||
(-> cfg/*cfg* ::doerg/doerg-parser str))
|
||||
(common/deref-with-timeout *uniorg-timeout-duration*))]
|
||||
(when (zero? (:exit r))
|
||||
(-> r :out (json/parse-string (comp keyword camel->kebab))))))
|
||||
|
||||
(declare gather-first-section gather-latex-paragraphs element-types)
|
||||
|
||||
(defn read-string
|
||||
[s & {:keys [post-processors]
|
||||
:or {post-processors [gather-first-section
|
||||
gather-latex-paragraphs]}}]
|
||||
(let [apply-post-processors (apply comp (reverse post-processors))]
|
||||
(with-in-str s
|
||||
(-> (uniorg :in *in*)
|
||||
apply-post-processors))))
|
||||
|
||||
|
||||
|
||||
(defn greater-element?
|
||||
"Return truthy if `e` is a greater org-element; i.e. one that can
|
||||
have children."
|
||||
[e]
|
||||
;; Not 100% sure if this is a valid definition. It seems that
|
||||
;; Uniorg sets `:children` to an empty vector when a great element
|
||||
;; lacks children.
|
||||
(and (map? e) (contains? e :children)))
|
||||
|
||||
(defn org-element? [element]
|
||||
(and (map? element)
|
||||
(contains? element :type)))
|
||||
|
||||
(defn type [element]
|
||||
(:type element))
|
||||
|
||||
(defn of-type?
|
||||
"Return truthy if the Org node `element` is of type `type`. In the
|
||||
vararg case, return truthy if `element` is of any of the types
|
||||
listed."
|
||||
([element type]
|
||||
(= (:type element) type))
|
||||
([element type & types]
|
||||
(contains? (into #{} (cons type types))
|
||||
(:type element))))
|
||||
|
||||
(defn of-keyword-type? [element key]
|
||||
(and (of-type? element "keyword")
|
||||
(= (:key element) key)))
|
||||
|
||||
(defn footnotes-section? [element]
|
||||
(and (of-type? element "section")
|
||||
(when-some [footnotes-headline (first (:children element))]
|
||||
(= "Footnotes" (:raw-value footnotes-headline)))))
|
||||
|
||||
(defn display-math?
|
||||
"Return truthy if `element` should be considered display math."
|
||||
[element]
|
||||
(or (of-type? element "latex-environment")
|
||||
(and (of-type? element "latex-fragment")
|
||||
(-> element :contents (str/starts-with? "\\[")))))
|
||||
|
||||
|
||||
;;; Zipper
|
||||
|
||||
(defn doerg-zip [document]
|
||||
(clojure.zip/zipper greater-element?
|
||||
:children
|
||||
#(assoc %1 :children %2)
|
||||
document))
|
||||
|
||||
(def children-walker
|
||||
"Walk each child of an Org element."
|
||||
[:children sp/ALL])
|
||||
|
||||
(def postorder-walker
|
||||
"Recursively walk each node of an Org element in post-order."
|
||||
(sp/recursive-path
|
||||
[] p
|
||||
(sp/if-path greater-element?
|
||||
(sp/continue-then-stay children-walker p)
|
||||
sp/STAY)))
|
||||
|
||||
(def postorder-walker*
|
||||
(sp/recursive-path
|
||||
[] p
|
||||
(sp/if-path greater-element?
|
||||
(sp/continue-then-stay :children p)
|
||||
sp/STAY)))
|
||||
|
||||
|
||||
;;; Post-processing
|
||||
|
||||
(def property-handlers
|
||||
"A map of node-property keys to functions. The functions will be
|
||||
called with two arguments: the pre-existing top-level document data
|
||||
(a map), and the node-property value. The function is expected to
|
||||
return the document data map with the new property merged in."
|
||||
{"ID" (fn [data id]
|
||||
(let [new-id (UUID/fromString id)]
|
||||
(when (contains? data :id)
|
||||
(lr/warnf (str "Found multiple :ID: definitions."
|
||||
" Replacing %s with %s.")
|
||||
(:id data) new-id))
|
||||
(assoc data :id new-id)))
|
||||
"DeertopiaVisibility"
|
||||
(fn [data visibility]
|
||||
(let [v (case visibility
|
||||
"public" :public
|
||||
"private" :private
|
||||
"graphonly" :graph-only
|
||||
(do (lr/warn "Unknown visibility: %s" visibility)
|
||||
:private))]
|
||||
(assoc data :net.deertopia/visibility v)))})
|
||||
|
||||
(def keyword-handlers
|
||||
"Like `property-handlers`, but for top-level keywords."
|
||||
{"TITLE" #(assoc %1 :title %2)})
|
||||
|
||||
(defn- apply-handlers [handlers values]
|
||||
(reduce (fn [data {:keys [key value]}]
|
||||
(let [f (get handlers key)]
|
||||
(f data value)))
|
||||
{}
|
||||
values))
|
||||
|
||||
(defn handle-properties [doc]
|
||||
(let [props (some-> doc :children first)]
|
||||
(when (of-type? props "property-drawer")
|
||||
(->> props
|
||||
(sp/select [children-walker
|
||||
#(contains? property-handlers (:key %))])
|
||||
(apply-handlers property-handlers)))))
|
||||
|
||||
(defn handle-keywords [doc]
|
||||
(->> doc
|
||||
(sp/select [children-walker
|
||||
#(and (of-type? % "keyword")
|
||||
(contains? keyword-handlers (:key %)))])
|
||||
(apply-handlers keyword-handlers)))
|
||||
|
||||
(defn gather-doerg-data [doc]
|
||||
(assoc doc :net.deertopia.doerg/data
|
||||
(merge (handle-properties doc)
|
||||
(handle-keywords doc))))
|
||||
|
||||
(defn- split-sections
|
||||
"Given a list of top-level nodes as spat out by the `uniorg`
|
||||
parser, return a map with the following keys
|
||||
• :top-level-nodes The nodes that /should/ be at the top-level.
|
||||
• :first-section-nodes The nodes that should be wrapped in a new
|
||||
section node.
|
||||
• :rest Everything else."
|
||||
[nodes]
|
||||
(let [[of-top-level remaining-nodes]
|
||||
(->> nodes (split-with #(of-type? % "property-drawer" "keyword")))
|
||||
[of-first-section remaining-nodes*]
|
||||
(->> remaining-nodes (split-with #(not (of-type? % "section"))))]
|
||||
{:top-level-nodes of-top-level
|
||||
:first-section-nodes of-first-section
|
||||
:rest remaining-nodes*}))
|
||||
|
||||
(defn- element-bounds [& nodes]
|
||||
(reduce (fn [acc {:keys [contents-begin contents-end]}]
|
||||
(if (and (nat-int? contents-begin)
|
||||
(nat-int? contents-end))
|
||||
(-> acc
|
||||
(update
|
||||
:contents-begin
|
||||
#(min (or % Integer/MAX_VALUE) contents-begin))
|
||||
(update
|
||||
:contents-end
|
||||
#(max (or % Integer/MIN_VALUE) contents-end)))
|
||||
acc))
|
||||
{:contents-begin nil
|
||||
:contents-end nil}
|
||||
nodes))
|
||||
|
||||
(defn gather-first-section [node]
|
||||
(assert (of-type? node "org-data")
|
||||
"`gather-doerg-data` should be applied to the document root.")
|
||||
(let [{:keys [top-level-nodes first-section-nodes rest]}
|
||||
(split-sections (:children node))
|
||||
;; TODO: Construct `:contents-begin` and `:contents-end` data
|
||||
;; by spanning the children.
|
||||
first-section (merge {:type "section"
|
||||
:children (vec first-section-nodes)}
|
||||
(apply element-bounds first-section-nodes))
|
||||
new-children (vec (concat top-level-nodes
|
||||
(list first-section)
|
||||
rest))]
|
||||
(assoc node :children new-children)))
|
||||
|
||||
(defn- newline-final-paragraph?
|
||||
"Is `e` a paragraph, and does it end with a newline?"
|
||||
[e]
|
||||
(and (of-type? e "paragraph")
|
||||
(some-> (-> e :position :end :column)
|
||||
(= 1))))
|
||||
|
||||
(defn consequtive-elements?
|
||||
"Returh truthy if each successive pair of elements is NOT separated
|
||||
by at least one explicit paragraph break; i.e. a blank line."
|
||||
[& elements]
|
||||
(match elements
|
||||
([(e₁ :guard newline-final-paragraph?) e₂ & es] :seq)
|
||||
(and (= (-> e₁ :position :end :line)
|
||||
(-> e₂ :position :start :line))
|
||||
(recur es))
|
||||
([e₁ e₂ & es] :seq)
|
||||
(and (= (-> e₁ :position :end :line inc)
|
||||
(-> e₂ :position :start :line))
|
||||
(recur es))
|
||||
([_] :seq) true
|
||||
([] :seq) true))
|
||||
|
||||
(defn swallow
|
||||
([predator prey]
|
||||
(assert (greater-element? predator))
|
||||
(-> predator
|
||||
(update :children #(conj % prey))
|
||||
(assoc-in [:position :end] (-> prey :position :end))))
|
||||
([predator prey & more-prey]
|
||||
(reduce swallow predator (cons prey more-prey))))
|
||||
|
||||
(defn- paragraph-followed-by-tex? [children]
|
||||
(match children
|
||||
[(para :guard #(of-type? % "paragraph"))
|
||||
(tex :guard #(of-type? % "latex-environment"))
|
||||
& _]
|
||||
(consequtive-elements? para tex)
|
||||
:else false))
|
||||
|
||||
(defn- paragraph-followed-by-paragraph? [children]
|
||||
(match children
|
||||
[(para₁ :guard #(of-type? % "paragraph"))
|
||||
(para₂ :guard #(of-type? % "paragraph"))
|
||||
& _]
|
||||
(consequtive-elements? para₁ para₂)
|
||||
:else false))
|
||||
|
||||
(defn gather-latex-paragraphs [node]
|
||||
(->> node
|
||||
(sp/transform
|
||||
[postorder-walker (sp/must :children)]
|
||||
(fn [children]
|
||||
(loop [acc []
|
||||
cs (vec children)]
|
||||
(match cs
|
||||
;; CASE: A paragraph followed by a LaTeX environment.
|
||||
;; If there are no blank lines separating the paragraph
|
||||
;; from the LaTeX environment, the LaTeX environment
|
||||
;; shall become a child of the paragraph.
|
||||
([para tex & rest] :guard paragraph-followed-by-tex?)
|
||||
(recur acc (vec (cons (swallow para tex) rest)))
|
||||
;; CASE: Similar to the paragraph-followed-by-tex case,
|
||||
;; but instead of swallowing the entire second element,
|
||||
;; we swallow the /children/ of the second element,
|
||||
;; since paragraphs cannot be nested.
|
||||
([para₁ para₂ & rest]
|
||||
:guard paragraph-followed-by-paragraph?)
|
||||
(recur acc (vec (cons (apply swallow para₁ (:children para₂))
|
||||
rest)))
|
||||
;; CASE: Irrelevant or empty!
|
||||
[c & rest]
|
||||
(recur (conj acc c) rest)
|
||||
[] acc))))))
|
||||
|
||||
Reference in New Issue
Block a user