]> Git — Sourcephile - doclang.git/blob - Language/TCT/Tree.hs
Fix XML rendition of PairFrenchquote.
[doclang.git] / Language / TCT / Tree.hs
1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE PatternSynonyms #-}
3 {-# LANGUAGE ViewPatterns #-}
4 module Language.TCT.Tree
5 ( module Language.TCT.Tree
6 , Tree(..), Trees
7 ) where
8
9 import Control.Monad (Monad(..))
10 import Data.Bool
11 import Data.Char (Char)
12 import Data.Eq (Eq(..))
13 import Data.Foldable (Foldable(..), any)
14 import Data.Function (($))
15 import Data.Functor ((<$>))
16 import Data.Int (Int)
17 import Data.List.NonEmpty (NonEmpty(..))
18 import Data.Monoid (Monoid(..))
19 import Data.Ord (Ordering(..), Ord(..))
20 import Data.Semigroup (Semigroup(..))
21 import Data.Sequence ((|>))
22 import Data.TreeSeq.Strict (Tree(..), Trees)
23 import Prelude (undefined, Num(..))
24 import System.FilePath (FilePath)
25 import Text.Show (Show(..))
26 import qualified Data.List as List
27 import qualified Data.Sequence as Seq
28 import qualified Data.Text.Lazy as TL
29
30 import Language.TCT.Utils
31 import Language.TCT.Cell
32 import Language.TCT.Elem
33 import Language.TCT.Debug
34
35 -- * Type 'Root'
36 -- | A single 'Tree' to rule all the 'Node's
37 -- simplifies greatly the navigation and transformations,
38 -- especially because the later XML or DTC output
39 -- are themselves a single tree-like data structure.
40 --
41 -- Also, having a single 'Tree' is easier to merge
42 -- XML coming from the first parsing phase (eg. @('NodeHeader' ('HeaderEqual' "li" ""))@),
43 -- and XML coming from the second parsing phase (eg. @NodePair (PairElem "li" [])@).
44 --
45 -- For error reporting, each 'Node' is annotated with a 'Cell'
46 -- spanning over all its content (sub-'Trees' included).
47 type Root = Tree (Cell Node)
48 type Roots = Trees (Cell Node)
49
50 pattern Tree0 :: a -> Tree a
51 pattern Tree0 a <- Tree a (null -> True)
52 where Tree0 a = Tree a mempty
53
54 -- * Type 'Node'
55 data Node
56 = NodeHeader !Header -- ^ node, from first parsing (indentation-sensitive)
57 | NodeText !TL.Text -- ^ leaf verbatim text, from first parsing (indentation-sensitive)
58 | NodePair !Pair -- ^ node, from second parsing (on some 'NodeText's)
59 | NodeToken !Token -- ^ leaf, from second parsing (on some 'NodeText's)
60 | NodeLower !Name !ElemAttrs -- ^ node, @<name a=b@
61 | NodePara -- ^ node, gather trees by paragraph,
62 -- useful to know when to generate a <para> XML node
63 deriving (Eq,Show)
64 instance Pretty Node
65
66 -- * Type 'Header'
67 data Header
68 = HeaderColon !Name !White -- ^ @name: @
69 | HeaderEqual !Name !White -- ^ @name=@
70 | HeaderBar !Name !White -- ^ @name|@
71 | HeaderGreat !Name !White -- ^ @name>@
72 | HeaderBrackets !Name -- ^ @[name]@
73 | HeaderDot !Name -- ^ @1. @
74 | HeaderDash -- ^ @- @
75 | HeaderDashDash -- ^ @-- @
76 | HeaderSection !LevelSection -- ^ @# @
77 | HeaderDotSlash !FilePath -- ^ @./file @
78 deriving (Eq, Ord, Show)
79 instance Pretty Header
80
81 -- ** Type 'Name'
82 type Name = TL.Text
83
84 -- ** Type 'LevelSection'
85 type LevelSection = Int
86
87 -- * Type 'Pair'
88 data Pair
89 = PairElem !ElemName !ElemAttrs -- ^ @\<name a0=v0 a1=v1>text\</name>@
90 | PairHash -- ^ @\#text#@
91 | PairStar -- ^ @*text*@
92 | PairSlash -- ^ @/text/@
93 | PairUnderscore -- ^ @_value_@
94 | PairDash -- ^ @-text-@
95 | PairBackquote -- ^ @`text`@
96 | PairSinglequote -- ^ @'text'@
97 | PairDoublequote -- ^ @"text"@
98 | PairFrenchquote -- ^ @«text»@
99 | PairParen -- ^ @(text)@
100 | PairBrace -- ^ @{text}@
101 | PairBracket -- ^ @[text]@
102 deriving (Eq,Ord,Show)
103 instance Pretty Pair
104
105 -- * Type 'Token'
106 data Token
107 = TokenText !TL.Text
108 | TokenEscape !Char
109 | TokenLink !Link
110 | TokenTag !Tag
111 deriving (Eq,Show)
112
113 -- ** Type 'Tag'
114 type Tag = TL.Text
115
116 -- ** Type 'Link'
117 type Link = TL.Text
118
119 -- * Type 'Row'
120 -- | In normal order: a list of 'Header's, maybe ended by 'Value', all read on the same line.
121 type Row = [Root]
122
123 -- ** Type 'Rows'
124 -- | In reverse order: a list of nodes in scope
125 -- (hence to which the next line can append to).
126 type Rows = [Root]
127
128 -- | Having an initial 'Root' simplifies 'mergeRowIndent':
129 -- one can always put the last 'Root' as a child to a previous one.
130 -- This 'Root' just has to be discarded by 'collapseRows'.
131 initRows :: Rows
132 initRows = [Tree0 $ Cell (Span "" p p :| []) $ NodeHeader HeaderDash]
133 where p = Pos{pos_line= -1, pos_column=0}
134 -- NOTE: such that any following 'Root'
135 -- is 'NodePara' if possible, and always a child.
136
137 -- | @mergeRow rows row@ append @row@ into @rows@, while merging what has to be.
138 --
139 -- * [@rows@] is old 'Rows', its |Root|s' 'cell_begin' are descending (non-strictly),
140 -- they MAY span over multilines, and they can be many from a single line.
141 -- * [@row@] is new 'Row', its |Root|s' 'cell_begin' are descending (non-strictly),
142 -- they MUST span only over a single and entire line.
143 --
144 -- This is the main entry point to build 'Rows' by accumulating 'Row' into them.
145 mergeRow :: Rows -> Row -> Rows
146 mergeRow rows row =
147 debug2_ "mergeRow" ("news",List.reverse row) ("olds",rows) $
148 mergeRowPrefix 0 rows $ List.reverse row
149
150 -- | Merge by considering matching prefixes.
151 --
152 -- 'HeaderGreat' and 'HeaderBar' work, not on indentation,
153 -- but on their vertical alignment as prefixes.
154 -- Hence, each new 'Row' has those prefixes zipped into a single one
155 -- when they match, are aligned and adjacent.
156 mergeRowPrefix :: ColNum -> Rows -> Row -> Rows
157 mergeRowPrefix col rows row =
158 debug3_ "mergeRowPrefix" ("col",col) ("news",row) ("olds",rows) $
159 case (row,rows) of
160 ([], _) -> rows
161 (_, []) -> undefined -- NOTE: cannot happen with initRows
162 ( _new@(Tree (Cell (Span _fn bn _en:|_sn) n) _ns):news
163 , _old@(Tree (Cell (Span _fo _bo eo:|_so) _o) _os):_olds ) ->
164 case collapseRowsWhile isCollapsable rows of
165 [] -> mergeRowIndent rows row
166 head@(unTree -> ch@(Cell (Span _fh bh _eh:|_sh) h)) : olds' ->
167 case (n,h) of
168 -- NOTE: zipping: when new is HeaderGreat, collapse last line downto col
169 -- then check if there is a matching HeaderGreat,
170 -- if so, discard new and restart with a col advanced to new's beginning
171 (NodeHeader HeaderGreat{}, NodeHeader HeaderGreat{})
172 | isAdjacent && isMatching ch -> discard
173 -- NOTE: same for HeaderBar
174 (NodeHeader HeaderBar{}, NodeHeader HeaderBar{})
175 | isAdjacent && isMatching ch -> discard
176 -- NOTE: collapsing: any other new aligned or on the right of an adjacent head
177 -- makes it collapse entirely
178 (_, NodeHeader HeaderGreat{})
179 | col < pos_column bh -> collapse
180 -- NOTE: same for HeaderBar
181 (_, NodeHeader HeaderBar{})
182 | col < pos_column bh -> collapse
183 _ -> debug "mergeRowPrefix/indent" $ mergeRowIndent rows row
184 where
185 isAdjacent = pos_line bn - pos_line eo <= 1
186 discard = debug "mergeRowPrefix/discard" $ mergeRowPrefix (pos_column bh) rows news
187 collapse = debug "mergeRowPrefix/collapse" $ mergeRowPrefix col (collapseRoot head olds') row
188 where
189 isMatching (Cell (Span _fh bh _eh:|_sh) h) =
190 pos_column bn == pos_column bh &&
191 n == h
192 isCollapsable = debug2 "mergeRowPrefix/isCollapsable" "new" "old" $
193 \_t0@(unTree -> c0@(Cell (Span _f0 b0 _e0:|_s0) _n0))
194 _t1@(unTree -> Cell (Span _f1 b1 e1:|_s1) _n1) ->
195 not (isMatching c0) &&
196 (pos_line b0 - pos_line e1 <= 1) && -- adjacent
197 col < pos_column b1 -- righter than col
198
199 -- | Merge by considering indentation.
200 mergeRowIndent :: Rows -> Row -> Rows
201 mergeRowIndent rows row =
202 debug2_ "mergeRowIndent" ("news",row) ("olds",rows) $
203 case (row,rows) of
204 ([], _) -> rows
205 (_, []) -> undefined -- NOTE: cannot happen with initRows
206 ( new@(Tree (Cell ssn@(Span fn bn en:|sn) n) ns):news
207 ,old@(Tree (Cell sso@(Span fo bo eo:|so) o) os):olds ) ->
208 case debug0 "mergeRowIndent/colNew" (pos_column bn) `compare`
209 debug0 "mergeRowIndent/colOld" (pos_column bo) of
210 -- NOTE: new is on the left
211 LT ->
212 case (n,o) of
213 -- NOTE: merge adjacent NodeText
214 -- first
215 -- second
216 (NodeText tn, NodeText to)
217 | TL.null tn || TL.null to
218 , not isVerbatim -> collapse
219 | isAdjacent && isIndented -> merge $ Tree t (os<>ns)
220 where
221 t = NodeText <$> Cell (Span fo boNew eo:|so) (indent<>to) <> Cell ssn tn
222 boNew = bo{pos_column=pos_column bn}
223 indent = TL.replicate (int64 $ pos_column bo - pos_column bn) " "
224 -- | Whether the horizontal delta is made of spaces
225 isIndented =
226 debug0 "mergeRowIndent/isIndented" $
227 case olds of
228 [] -> True
229 (unTree -> (cell_spans -> (span_end -> ep) :| _)) : _ ->
230 case pos_line ep `compare` pos_line bo of
231 LT -> True
232 EQ -> pos_column ep <= pos_column bn
233 _ -> False
234 _ -> collapse
235 -- NOTE: new is vertically aligned
236 EQ ->
237 case (n,o) of
238 -- NOTE: preserve all NodeText "", but still split into two NodePara
239 (NodeText tn, NodeText to)
240 | TL.null tn || TL.null to
241 , not isVerbatim -> collapse
242 | isAdjacent -> merge $ Tree (NodeText <$> Cell sso to <> Cell ssn tn) (os<>ns)
243 -- NOTE: HeaderSection can parent Nodes at the same level
244 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
245 if debug0 "mergeRowIndent/lvlNew" lvlNew
246 > debug0 "mergeRowIndent/lvlOld" lvlOld
247 -- # old
248 -- ## new
249 then concat
250 -- ## old or # old
251 -- # new # new
252 else collapse
253 -- NOTE: old is no HeaderSection, then collapse to any older and loop
254 (NodeHeader HeaderSection{}, _)
255 | rows'@(sec:_) <- collapseRowsWhile isCollapsable rows
256 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
257 mergeRowIndent rows' row
258 where
259 isCollapsable = debug2 "mergeRowIndent/isCollapsable" "new" "old" $
260 \_t0@(unTree -> Cell (Span _f0 b0 _e0:|_ss0) n0) _t1 ->
261 case n0 of
262 NodeHeader HeaderSection{} -> False
263 _ -> pos_column bn == pos_column b0
264 -- NOTE: in case of alignment, HeaderSection is parent
265 (_, NodeHeader HeaderSection{}) -> concat
266 --
267 _ -> replace
268 -- NOTE: new is on the right
269 GT ->
270 case (n,o) of
271 -- NOTE: keep NodeText "" out of old NodePara
272 (NodeText "", NodePara) -> collapse
273 -- NOTE: merge adjacent NodeText
274 (NodeText tn, NodeText to) ->
275 case isAdjacent of
276 _ | TL.null tn || TL.null to
277 , not isVerbatim -> collapse
278 -- old
279 -- new
280 True -> merge $ Tree (NodeText <$> Cell sso to <> Cell ssn tn) (os<>ns)
281 -- old
282 --
283 -- new
284 False -> mergeRowIndent (collapseRoot old olds) (shifted:news)
285 where
286 shifted = Tree (Cell (Span fn bnNew en:|sn) $ NodeText $ indent<>tn) (os<>ns)
287 bnNew = bn{pos_column=pos_column bo}
288 indent = TL.replicate (int64 $ pos_column bn - pos_column bo) " "
289 --
290 _ -> concat
291 where
292 isAdjacent = pos_line bn - pos_line eo <= 1
293 -- | Whether a parent semantic want new to stay a NodeText
294 isVerbatim = any p rows
295 where
296 p (unTree -> (unCell -> NodeHeader HeaderBar{})) = True
297 p _ = False
298 concat = debug "mergeRowIndent/concat" $ List.reverse row <> rows
299 merge m = debug "mergeRowIndent/merge" $ mergeRowIndent (m : olds) news
300 collapse = debug "mergeRowIndent/collapse" $ mergeRowIndent (collapseRoot old olds) row
301 replace = debug "mergeRowIndent/replace" $ mergeRowIndent (new : collapseRoot old olds) news
302
303 -- | Like 'mergeRowIndent', but without maintaining the appending,
304 -- hence collapsing all the 'Root's of the given 'Rows'.
305 --
306 -- NOTE: 'initRows' MUST have been the first 'Rows'
307 -- before calling 'mergeRowIndent' on it to get the given 'Rows'.
308 collapseRows :: Rows -> Roots
309 collapseRows rows =
310 debug1_ "collapseRows" ("rows",rows) $
311 case collapseRowsWhile (\_new _old -> True) rows of
312 [t] -> subTrees t
313 _ -> undefined
314 -- NOTE: subTrees returns the children of the updated initRows
315
316 -- | Collapse downto any last HeaderSection, returning it and its level.
317 collapseSection :: ColNum -> Rows -> Rows
318 collapseSection col = debug1 "collapseSection" "rows" go
319 where
320 go rows@(new@(unTree -> Cell (Span _fn bn _en:|_sn) n):olds)
321 | col <= pos_column bn =
322 case n of
323 NodeHeader HeaderSection{} -> rows
324 _ -> collapseSection col $ collapseRoot new $ go olds
325 go _ = mempty
326
327 collapseRowsWhile :: (Root -> Root -> Bool) -> Rows -> Rows
328 collapseRowsWhile test = debug1 "collapseRowsWhile" "rows" $ \case
329 [] -> mempty
330 rows@(new@(Tree (Cell (Span _fn bn _en:|_sn) n) _ns):news) ->
331 case news of
332 [] -> rows
333 old@(Tree (Cell (Span _fo bo eo:|_so) o) _os):olds
334 | not $ test new old -> rows
335 | otherwise ->
336 case debug0 "collapseRowsWhile/colNew" (pos_column bn) `compare`
337 debug0 "collapseRowsWhile/colOld" (pos_column bo) of
338 -- NOTE: new is vertically aligned
339 EQ ->
340 case (n,o) of
341 -- NOTE: HeaderSection can parent Nodes at the same level
342 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
343 if debug0 "collapseRowsWhile/lvlNew" lvlNew
344 > debug0 "collapseRowsWhile/lvlOld" lvlOld
345 -- # old
346 -- ## new
347 then collapse
348 -- ## old or # old
349 -- # new # new
350 else
351 debug "collapseRowsWhile/replace" $
352 collapseRowsWhile test $ (new:) $ collapseRoot old olds
353 -- NOTE: old is no HeaderSection, then collapse to any older and loop
354 (NodeHeader HeaderSection{}, _)
355 | news'@(sec:_) <- debug0 "collapseRowsWhile/section" $ collapseRowsWhile isCollapsable news
356 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
357 collapseRowsWhile test news'
358 where
359 isCollapsable = debug2 "collapseRowsWhile/isCollapsable" "new" "old" $
360 \_t0@(unTree -> Cell (Span _f0 b0 _e0:|_s0) n0) _t1 ->
361 case n0 of
362 NodeHeader HeaderSection{} -> False
363 _ -> pos_column bn == pos_column b0
364 -- NOTE: in case of alignment, HeaderSection is parent
365 (_, NodeHeader HeaderSection{}) -> debug "collapseRowsWhile/section/parent" collapse
366 -- NOTE: merge within old NodePara.
367 (_, NodePara) | isAdjacent -> collapse
368 --
369 _ -> collapse2
370 -- NOTE: new is either on the left or on the right
371 _ -> collapse
372 where
373 isAdjacent = pos_line bn - pos_line eo <= 1
374 collapse = debug "collapseRowsWhile/collapse" $ collapseRowsWhile test $ collapseRoot new $ news
375 collapse2 = debug "collapseRowsWhile/collapse2" $ collapseRowsWhile test $ collapseRoot new $ collapseRoot old $ olds
376
377 -- | Put a 'Root' as a child of the head 'Root'.
378 --
379 -- NOTE: 'collapseRoot' is where 'NodePara' may be introduced.
380 --
381 -- NOTE: any NodeText/NodeText merging must have been done before.
382 collapseRoot :: Root -> Rows -> Rows
383 collapseRoot new@(Tree (Cell ssn@(Span _fn bn en:|_sn) n) _ns) rows =
384 debug2_ "collapseRoot" ("new",Seq.singleton new) ("rows",rows) $
385 case rows of
386 [] -> return new
387 old@(Tree (Cell (Span fo bo eo:|so) o) os) : olds ->
388 case (n,o) of
389 -- NOTE: no child into NodeText
390 (_, NodeText{}) -> collapse2
391 -- NOTE: NodeText can begin a NodePara
392 (NodeText tn, _) | not $ TL.null tn ->
393 case o of
394 -- NOTE: no NodePara within those
395 NodeHeader HeaderEqual{} -> collapse
396 NodeHeader HeaderBar{} -> collapse
397 NodeHeader HeaderDashDash{} -> collapse
398 -- NOTE: NodePara within those
399 NodePara | not isAdjacent -> para
400 NodeHeader{} -> para
401 _ -> collapse
402 -- NOTE: amongst remaining nodes, only adjacent ones may enter an old NodePara.
403 -- Note that since a NodePara is never adjacent to another,
404 -- it is not nested within another.
405 (_, NodePara)
406 | isAdjacent ->
407 case n of
408 -- NOTE: no HeaderSection (even adjacent) within a NodePara
409 NodeHeader HeaderSection{} -> collapse2
410 _ -> collapse
411 | otherwise -> collapse2
412 _ -> collapse
413 where
414 isAdjacent = pos_line bn - pos_line eo <= 1
415 para = Tree (Cell ssn NodePara) (return new) : rows
416 collapse = Tree (Cell (Span fo bo en:|so) o) (os |> new) : olds
417 collapse2 = collapseRoot new $ collapseRoot old olds