]> Git — Sourcephile - doclang.git/blob - Language/TCT/Tree.hs
Add golden tests for DTC.
[doclang.git] / Language / TCT / Tree.hs
1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE PatternSynonyms #-}
3 {-# LANGUAGE ViewPatterns #-}
4 module Language.TCT.Tree
5 ( module Language.TCT.Tree
6 , Tree(..), Trees
7 ) where
8
9 import Control.Monad (Monad(..))
10 import Data.Bool
11 import Data.Char (Char)
12 import Data.Eq (Eq(..))
13 import Data.Foldable (Foldable(..), any)
14 import Data.Function (($))
15 import Data.Functor ((<$>))
16 import Data.Int (Int)
17 import Data.Monoid (Monoid(..))
18 import Data.Ord (Ordering(..), Ord(..))
19 import Data.Semigroup (Semigroup(..))
20 import Data.Sequence ((|>))
21 import Data.TreeSeq.Strict (Tree(..), Trees)
22 import Prelude (undefined, Num(..))
23 import System.FilePath (FilePath)
24 import Text.Show (Show(..))
25 import qualified Data.List as List
26 import qualified Data.Sequence as Seq
27 import qualified Data.Text.Lazy as TL
28
29 import Language.TCT.Utils
30 import Language.TCT.Cell
31 import Language.TCT.Elem
32 import Language.TCT.Debug
33
34 -- * Type 'Root'
35 -- | A single 'Tree' to rule all the 'Node's
36 -- simplifies greatly the navigation and transformations,
37 -- especially because the later XML or DTC output
38 -- are themselves a single tree-like data structure.
39 --
40 -- Also, having a single 'Tree' is easier to merge
41 -- XML coming from the first parsing phase (eg. @('NodeHeader' ('HeaderEqual' "li" ""))@),
42 -- and XML coming from the second parsing phase (eg. @NodePair (PairElem "li" [])@).
43 --
44 -- For error reporting, each 'Node' is annotated with a 'Cell'
45 -- spanning over all its content (sub-'Trees' included).
46 type Root = Tree (Cell Node)
47 type Roots = Trees (Cell Node)
48
49 pattern Tree0 :: a -> Tree a
50 pattern Tree0 a <- Tree a (null -> True)
51 where Tree0 a = Tree a mempty
52
53 -- * Type 'Node'
54 data Node
55 = NodeHeader !Header -- ^ node, from first parsing (indentation-sensitive)
56 | NodeText !TL.Text -- ^ leaf verbatim text, from first parsing (indentation-sensitive)
57 | NodePair !Pair -- ^ node, from second parsing (on some 'NodeText's)
58 | NodeToken !Token -- ^ leaf, from second parsing (on some 'NodeText's)
59 | NodeLower !Name !ElemAttrs -- ^ node, @<name a=b@
60 | NodePara -- ^ node, gather trees by paragraph,
61 -- useful to know when to generate a <para> XML node
62 | NodeGroup -- ^ node, group trees into a single tree,
63 -- useful to return many trees when only one is expected
64 deriving (Eq,Show)
65 instance Pretty Node
66
67 -- * Type 'Header'
68 data Header
69 = HeaderColon !Name !White -- ^ @name: @
70 | HeaderEqual !Name !White -- ^ @name=@
71 | HeaderBar !Name !White -- ^ @name|@
72 | HeaderGreat !Name !White -- ^ @name>@
73 | HeaderBrackets !Name -- ^ @[name]@
74 | HeaderDot !Name -- ^ @1. @
75 | HeaderDash -- ^ @- @
76 | HeaderDashDash -- ^ @-- @
77 | HeaderSection !LevelSection -- ^ @# @
78 | HeaderDotSlash !FilePath -- ^ @./file @
79 deriving (Eq, Ord, Show)
80 instance Pretty Header
81
82 -- ** Type 'Name'
83 type Name = TL.Text
84
85 -- ** Type 'LevelSection'
86 type LevelSection = Int
87
88 -- * Type 'Pair'
89 data Pair
90 = PairElem !ElemName !ElemAttrs -- ^ @\<name a0=v0 a1=v1>text\</name>@
91 | PairHash -- ^ @\#text#@
92 | PairStar -- ^ @*text*@
93 | PairSlash -- ^ @/text/@
94 | PairUnderscore -- ^ @_value_@
95 | PairDash -- ^ @-text-@
96 | PairBackquote -- ^ @`text`@
97 | PairSinglequote -- ^ @'text'@
98 | PairDoublequote -- ^ @"text"@
99 | PairFrenchquote -- ^ @«text»@
100 | PairParen -- ^ @(text)@
101 | PairBrace -- ^ @{text}@
102 | PairBracket -- ^ @[text]@
103 deriving (Eq,Ord,Show)
104 instance Pretty Pair
105
106 -- * Type 'Token'
107 data Token
108 = TokenText !TL.Text
109 | TokenEscape !Char
110 | TokenLink !Link
111 | TokenTag !Tag
112 deriving (Eq,Show)
113
114 -- ** Type 'Tag'
115 type Tag = TL.Text
116
117 -- ** Type 'Link'
118 type Link = TL.Text
119
120 -- * Type 'Row'
121 -- | In normal order: a list of 'Header's, maybe ended by 'Value', all read on the same line.
122 type Row = [Root]
123
124 -- ** Type 'Rows'
125 -- | In reverse order: a list of nodes in scope
126 -- (hence to which the next line can append to).
127 type Rows = [Root]
128
129 -- | Having an initial 'Root' simplifies 'mergeRowIndent':
130 -- one can always put the last 'Root' as a child to a previous one.
131 -- This 'Root' just has to be discarded by 'collapseRows'.
132 initRows :: Rows
133 initRows = [Tree0 (Cell p p NodeGroup)]
134 where p = pos1{pos_line= -1, pos_column=0}
135 -- NOTE: such that any following 'Root'
136 -- is 'NodePara' if possible, and always a child.
137
138 -- | @mergeRow rows row@ append @row@ into @rows@, while merging what has to be.
139 --
140 -- * [@rows@] is old 'Rows', its |Root|s' 'cell_begin' are descending (non-strictly),
141 -- they MAY span over multilines, and they can be many from a single line.
142 -- * [@row@] is new 'Row', its |Root|s' 'cell_begin' are descending (non-strictly),
143 -- they MUST span only over a single and entire line.
144 --
145 -- This is the main entry point to build 'Rows' by accumulating 'Row' into them.
146 mergeRow :: Rows -> Row -> Rows
147 mergeRow rows row =
148 debug2_ "mergeRow" ("news",List.reverse row) ("olds",rows) $
149 mergeRowPrefix 0 rows $ List.reverse row
150
151 -- | Merge by considering matching prefixes.
152 --
153 -- 'HeaderGreat' and 'HeaderBar' work, not on indentation,
154 -- but on their vertical alignment as prefixes.
155 -- Hence, each new 'Row' has those prefixes zipped into a single one
156 -- when they match, are aligned and adjacent.
157 mergeRowPrefix :: ColNum -> Rows -> Row -> Rows
158 mergeRowPrefix col rows row =
159 debug3_ "mergeRowPrefix" ("col",col) ("news",row) ("olds",rows) $
160 case (row,rows) of
161 ([], _) -> rows
162 (_, []) -> undefined -- NOTE: cannot happen with initRows
163 ( _new@(Tree (Cell bn _en n) _ns):news
164 , _old@(Tree (Cell _bo eo _o) _os):_olds ) ->
165 case collapseRowsWhile isCollapsable rows of
166 [] -> mergeRowIndent rows row
167 head@(unTree -> ch@(Cell bh _eh h)) : olds' ->
168 case (n,h) of
169 -- NOTE: zipping: when new is HeaderGreat, collapse last line downto col
170 -- then check if there is a matching HeaderGreat,
171 -- if so, discard new and restart with a col advanced to new's beginning
172 (NodeHeader HeaderGreat{}, NodeHeader HeaderGreat{})
173 | isAdjacent && isMatching ch -> discard
174 {-
175 | pos_column bn == pos_column bh
176 , isAdjacent
177 , hn == hh
178 -}
179 -- NOTE: same for HeaderBar
180 (NodeHeader HeaderBar{}, NodeHeader HeaderBar{})
181 | isAdjacent && isMatching ch -> discard
182 -- NOTE: collapsing: any other new aligned or on the right of an adjacent head
183 -- makes it collapse entirely
184 (_, NodeHeader HeaderGreat{})
185 | col < pos_column bh -> collapse
186 -- NOTE: same for HeaderBar
187 (_, NodeHeader HeaderBar{})
188 | col < pos_column bh -> collapse
189 _ -> debug "mergeRowPrefix/indent" $ mergeRowIndent rows row
190 where
191 isAdjacent = pos_line bn - pos_line eo <= 1
192 discard = debug "mergeRowPrefix/discard" $ mergeRowPrefix (pos_column bh) rows news
193 collapse = debug "mergeRowPrefix/collapse" $ mergeRowPrefix col (collapseRoot head olds') row
194 where
195 isMatching (Cell bh _eh h) =
196 pos_column bn == pos_column bh &&
197 n == h
198 isCollapsable = debug2 "mergeRowPrefix/isCollapsable" "new" "old" $
199 \_t0@(unTree -> c0@(Cell b0 _e0 _n0)) _t1@(unTree -> Cell b1 e1 _n1) ->
200 not (isMatching c0) &&
201 (pos_line b0 - pos_line e1 <= 1) && -- adjacent
202 col < pos_column b1 -- righter than col
203
204 -- | Merge by considering indentation.
205 mergeRowIndent :: Rows -> Row -> Rows
206 mergeRowIndent rows row =
207 debug2_ "mergeRowIndent" ("news",row) ("olds",rows) $
208 case (row,rows) of
209 ([], _) -> rows
210 (_, []) -> undefined -- NOTE: cannot happen with initRows
211 ( new@(Tree (Cell bn en n) ns):news
212 ,old@(Tree (Cell bo eo o) os):olds ) ->
213 case debug0 "mergeRowIndent/colNew" (pos_column bn) `compare`
214 debug0 "mergeRowIndent/colOld" (pos_column bo) of
215 -- NOTE: new is on the left
216 LT ->
217 case (n,o) of
218 -- NOTE: merge adjacent NodeText
219 -- first
220 -- second
221 (NodeText tn, NodeText to)
222 | TL.null tn || TL.null to
223 , not isVerbatim -> collapse
224 | isAdjacent && isIndented -> merge $ Tree t (os<>ns)
225 where
226 t = NodeText <$> Cell boNew eo (indent<>to) <> Cell bn en tn
227 boNew = bo{pos_column=pos_column bn}
228 indent = TL.replicate (int64 $ pos_column bo - pos_column bn) " "
229 -- | Whether the horizontal delta is made of spaces
230 isIndented =
231 debug0 "mergeRowIndent/isIndented" $
232 case olds of
233 [] -> True
234 (unTree -> (cell_end -> ep)) : _ ->
235 case pos_line ep `compare` pos_line bo of
236 LT -> True
237 EQ -> pos_column ep <= pos_column bn
238 _ -> False
239 _ -> collapse
240 -- NOTE: new is vertically aligned
241 EQ ->
242 case (n,o) of
243 -- NOTE: preserve all NodeText "", but still split into two NodePara
244 (NodeText tn, NodeText to)
245 | TL.null tn || TL.null to
246 , not isVerbatim -> collapse
247 | isAdjacent -> merge $ Tree (NodeText <$> Cell bo eo to <> Cell bn en tn) (os<>ns)
248 -- NOTE: HeaderSection can parent Nodes at the same level
249 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
250 if debug0 "mergeRowIndent/lvlNew" lvlNew
251 > debug0 "mergeRowIndent/lvlOld" lvlOld
252 -- # old
253 -- ## new
254 then concat
255 -- ## old or # old
256 -- # new # new
257 else collapse
258 -- NOTE: old is no HeaderSection, then collapse to any older and loop
259 (NodeHeader HeaderSection{}, _)
260 | rows'@(sec:_) <- collapseRowsWhile isCollapsable rows
261 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
262 mergeRowIndent rows' row
263 where
264 isCollapsable = debug2 "mergeRowIndent/isCollapsable" "new" "old" $
265 \_t0@(unTree -> Cell b0 _e0 n0) _t1 ->
266 case n0 of
267 NodeHeader HeaderSection{} -> False
268 _ -> pos_column bn == pos_column b0
269 -- NOTE: in case of alignment, HeaderSection is parent
270 (_, NodeHeader HeaderSection{}) -> concat
271 --
272 _ -> replace
273 -- NOTE: new is on the right
274 GT ->
275 case (n,o) of
276 -- NOTE: only same line Root can be pushed on HeaderBar
277 -- DELME: (_, NodeHeader HeaderBar{}) | pos_column bn /= pos_column eo -> collapse
278 -- NOTE: keep NodeText "" out of old NodePara
279 (NodeText "", NodePara) -> collapse
280 -- NOTE: merge adjacent NodeText
281 (NodeText tn, NodeText to) ->
282 case isAdjacent of
283 _ | TL.null tn || TL.null to
284 , not isVerbatim -> collapse
285 -- old
286 -- new
287 True -> merge $ Tree (NodeText <$> Cell bo eo to <> Cell bn en tn) (os<>ns)
288 -- old
289 --
290 -- new
291 False -> mergeRowIndent (collapseRoot old olds) (shifted:news)
292 where
293 shifted = Tree (Cell bnNew en $ NodeText $ indent<>tn) (os<>ns)
294 bnNew = bn{pos_column=pos_column bo}
295 indent = TL.replicate (int64 $ pos_column bn - pos_column bo) " "
296 --
297 _ -> concat
298 where
299 isAdjacent = pos_line bn - pos_line eo <= 1
300 -- | Whether a parent semantic want new to stay a NodeText
301 isVerbatim = any p rows
302 where
303 p (unTree -> (unCell -> NodeHeader HeaderBar{})) = True
304 p _ = False
305 concat = debug "mergeRowIndent/concat" $ List.reverse row <> rows
306 merge m = debug "mergeRowIndent/merge" $ mergeRowIndent (m : olds) news
307 collapse = debug "mergeRowIndent/collapse" $ mergeRowIndent (collapseRoot old olds) row
308 replace = debug "mergeRowIndent/replace" $ mergeRowIndent (new : collapseRoot old olds) news
309
310 -- | Like 'mergeRowIndent', but without maintaining the appending,
311 -- hence collapsing all the 'Root's of the given 'Rows'.
312 --
313 -- NOTE: 'initRows' MUST have been the first 'Rows'
314 -- before calling 'mergeRowIndent' on it to get the given 'Rows'.
315 collapseRows :: Rows -> Roots
316 collapseRows rows =
317 debug1_ "collapseRows" ("rows",rows) $
318 case collapseRowsWhile (\_new _old -> True) rows of
319 [t] -> subTrees t
320 _ -> undefined
321 -- NOTE: subTrees returns the children of the updated initRows
322
323 -- | Collapse downto any last HeaderSection, returning it and its level.
324 collapseSection :: ColNum -> Rows -> Rows
325 collapseSection col = debug1 "collapseSection" "rows" go
326 where
327 go rows@(new@(unTree -> Cell bn _en n):olds)
328 | col <= pos_column bn =
329 case n of
330 NodeHeader HeaderSection{} -> rows
331 _ -> collapseSection col $ collapseRoot new $ go olds
332 go _ = mempty
333
334 collapseRowsWhile :: (Root -> Root -> Bool) -> Rows -> Rows
335 collapseRowsWhile test = debug1 "collapseRowsWhile" "rows" $ \case
336 [] -> mempty
337 rows@(new@(Tree (Cell bn _en n) _ns):news) ->
338 case news of
339 [] -> rows
340 old@(Tree (Cell bo eo o) _os):olds
341 | not $ test new old -> rows
342 | otherwise ->
343 case debug0 "collapseRowsWhile/colNew" (pos_column bn) `compare`
344 debug0 "collapseRowsWhile/colOld" (pos_column bo) of
345 -- NOTE: new is vertically aligned
346 EQ ->
347 case (n,o) of
348 -- NOTE: HeaderSection can parent Nodes at the same level
349 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
350 if debug0 "collapseRowsWhile/lvlNew" lvlNew
351 > debug0 "collapseRowsWhile/lvlOld" lvlOld
352 -- # old
353 -- ## new
354 then collapse
355 -- ## old or # old
356 -- # new # new
357 else
358 debug "collapseRowsWhile/replace" $
359 collapseRowsWhile test $ (new:) $ collapseRoot old olds
360 -- NOTE: old is no HeaderSection, then collapse to any older and loop
361 (NodeHeader HeaderSection{}, _)
362 | news'@(sec:_) <- debug0 "collapseRowsWhile/section" $ collapseRowsWhile isCollapsable news
363 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
364 collapseRowsWhile test news'
365 where
366 isCollapsable = debug2 "collapseRowsWhile/isCollapsable" "new" "old" $
367 \_t0@(unTree -> Cell b0 _e0 n0) _t1 ->
368 case n0 of
369 NodeHeader HeaderSection{} -> False
370 _ -> pos_column bn == pos_column b0
371 -- NOTE: in case of alignment, HeaderSection is parent
372 (_, NodeHeader HeaderSection{}) -> debug "collapseRowsWhile/section/parent" collapse
373 -- NOTE: merge within old NodePara.
374 (_, NodePara) | isAdjacent -> collapse
375 --
376 _ -> collapse2
377 -- NOTE: new is either on the left or on the right
378 _ -> collapse
379 where
380 isAdjacent = pos_line bn - pos_line eo <= 1
381 collapse = debug "collapseRowsWhile/collapse" $ collapseRowsWhile test $ collapseRoot new $ news
382 collapse2 = debug "collapseRowsWhile/collapse2" $ collapseRowsWhile test $ collapseRoot new $ collapseRoot old $ olds
383
384 -- | Put a 'Root' as a child of the head 'Root'.
385 --
386 -- NOTE: 'collapseRoot' is where 'NodePara' may be introduced.
387 --
388 -- NOTE: any NodeText/NodeText merging must have been done before.
389 collapseRoot :: Root -> Rows -> Rows
390 collapseRoot new@(Tree (Cell bn en n) _ns) rows =
391 debug2_ "collapseRoot" ("new",Seq.singleton new) ("rows",rows) $
392 case rows of
393 [] -> return new
394 old@(Tree (Cell bo eo o) os) : olds ->
395 case (n,o) of
396 -- NOTE: no child into NodeText
397 (_, NodeText{}) -> collapse2
398 -- NOTE: NodeText can begin a NodePara
399 (NodeText tn, _) | not $ TL.null tn ->
400 case o of
401 -- NOTE: no NodePara within those
402 NodeHeader HeaderEqual{} -> collapse
403 NodeHeader HeaderBar{} -> collapse
404 NodeHeader HeaderDashDash{} -> collapse
405 -- NOTE: NodePara within those
406 NodePara | not isAdjacent -> para
407 NodeHeader{} -> para
408 NodeGroup -> para
409 _ -> collapse
410 -- NOTE: amongst remaining nodes, only adjacent ones may enter an old NodePara.
411 -- Note that since a NodePara is never adjacent to another,
412 -- it is not nested within another.
413 (_, NodePara)
414 | isAdjacent ->
415 case n of
416 -- NOTE: no HeaderSection (even adjacent) within a NodePara
417 NodeHeader HeaderSection{} -> collapse2
418 _ -> collapse
419 | otherwise -> collapse2
420 _ -> collapse
421 where
422 isAdjacent = pos_line bn - pos_line eo <= 1
423 para = Tree (Cell bn en NodePara) (return new) : rows
424 collapse = Tree (Cell bo en o) (os |> new) : olds
425 collapse2 = collapseRoot new $ collapseRoot old olds