]> Git — Sourcephile - doclang.git/blob - Language/TCT/Tree.hs
Sync HTML5 rendition of DTC with new TCT parsing.
[doclang.git] / Language / TCT / Tree.hs
1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE PatternSynonyms #-}
3 {-# LANGUAGE ViewPatterns #-}
4 module Language.TCT.Tree
5 ( module Language.TCT.Tree
6 , Tree(..), Trees
7 ) where
8
9 import Control.Monad (Monad(..))
10 import Data.Bool
11 import Data.Char (Char)
12 import Data.Eq (Eq(..))
13 import Data.Foldable (Foldable(..), any)
14 import Data.Function (($))
15 import Data.Functor ((<$>))
16 import Data.Int (Int)
17 import Data.Monoid (Monoid(..))
18 import Data.Ord (Ordering(..), Ord(..))
19 import Data.Semigroup (Semigroup(..))
20 import Data.Sequence ((|>))
21 import Data.TreeSeq.Strict (Tree(..), Trees)
22 import Prelude (undefined, Num(..))
23 import System.FilePath (FilePath)
24 import Text.Show (Show(..))
25 import qualified Data.List as List
26 import qualified Data.Sequence as Seq
27 import qualified Data.Text.Lazy as TL
28
29 import Language.TCT.Utils
30 import Language.TCT.Cell
31 import Language.TCT.Elem
32 import Language.TCT.Debug
33
34 -- * Type 'Root'
35 -- | A single 'Tree' to rule all the 'Node's
36 -- simplifies greatly the navigation and transformations,
37 -- especially because the later XML or DTC output
38 -- are themselves a single tree-like data structure.
39 --
40 -- Also, having a single 'Tree' is easier to merge
41 -- XML coming from the first parsing phase (eg. @('NodeHeader' ('HeaderEqual' "li" ""))@),
42 -- and XML coming from the second parsing phase (eg. @NodePair (PairElem "li" [])@).
43 --
44 -- For error reporting, each 'Node' is annotated with a 'Cell'
45 -- spanning over all its content (sub-'Trees' included).
46 type Root = Tree (Cell Node)
47 type Roots = Trees (Cell Node)
48
49 pattern Tree0 :: a -> Tree a
50 pattern Tree0 a <- Tree a (null -> True)
51 where Tree0 a = Tree a mempty
52
53 -- * Type 'Node'
54 data Node
55 = NodeHeader !Header -- ^ node, from first parsing (indentation-sensitive)
56 | NodeText !TL.Text -- ^ leaf verbatim text, from first parsing (indentation-sensitive)
57 | NodePair !Pair -- ^ node, from second parsing (on some 'NodeText's)
58 | NodeToken !Token -- ^ leaf, from second parsing (on some 'NodeText's)
59 | NodeLower !Name !ElemAttrs -- ^ node, @<name a=b@
60 | NodePara -- ^ node, gather trees by paragraph,
61 -- useful to know when to generate a <para> XML node
62 | NodeGroup -- ^ node, group trees into a single tree,
63 -- useful to return many trees when only one is expected
64 deriving (Eq,Show)
65 instance Pretty Node
66
67 -- * Type 'Header'
68 data Header
69 = HeaderColon !Name !White -- ^ @name: @
70 | HeaderEqual !Name !White -- ^ @name=@
71 | HeaderBar !Name !White -- ^ @name|@
72 | HeaderGreat !Name !White -- ^ @name>@
73 | HeaderBrackets !Name -- ^ @[name]@
74 | HeaderDot !Name -- ^ @1. @
75 | HeaderDash -- ^ @- @
76 | HeaderDashDash -- ^ @-- @
77 | HeaderSection !LevelSection -- ^ @# @
78 | HeaderDotSlash !FilePath -- ^ @./file @
79 deriving (Eq, Ord, Show)
80 instance Pretty Header
81
82 -- ** Type 'Name'
83 type Name = TL.Text
84
85 -- ** Type 'LevelSection'
86 type LevelSection = Int
87
88 -- * Type 'Pair'
89 data Pair
90 = PairElem !ElemName !ElemAttrs -- ^ @\<name a0=v0 a1=v1>text\</name>@
91 | PairHash -- ^ @\#text#@
92 | PairStar -- ^ @*text*@
93 | PairSlash -- ^ @/text/@
94 | PairUnderscore -- ^ @_value_@
95 | PairDash -- ^ @-text-@
96 | PairBackquote -- ^ @`text`@
97 | PairSinglequote -- ^ @'text'@
98 | PairDoublequote -- ^ @"text"@
99 | PairFrenchquote -- ^ @«text»@
100 | PairParen -- ^ @(text)@
101 | PairBrace -- ^ @{text}@
102 | PairBracket -- ^ @[text]@
103 deriving (Eq,Ord,Show)
104 instance Pretty Pair
105
106 -- * Type 'Token'
107 data Token
108 = TokenText !TL.Text
109 | TokenEscape !Char
110 | TokenLink !Link
111 | TokenTag !Tag
112 deriving (Eq,Show)
113
114 -- ** Type 'Tag'
115 type Tag = TL.Text
116
117 -- ** Type 'Link'
118 type Link = TL.Text
119
120 -- * Type 'Row'
121 -- | In normal order: a list of 'Header's, maybe ended by 'Value', all read on the same line.
122 type Row = [Root]
123
124 -- ** Type 'Rows'
125 -- | In reverse order: a list of nodes in scope
126 -- (hence to which the next line can append to).
127 type Rows = [Root]
128
129 -- | Having an initial 'Root' simplifies 'mergeRowIndent':
130 -- one can always put the last 'Root' as a child to a previous one.
131 -- This 'Root' just has to be discarded by 'collapseRows'.
132 initRows :: Rows
133 initRows = [Tree0 (Cell p p NodeGroup)]
134 where p = pos1{pos_line= -1, pos_column=0}
135 -- NOTE: such that any following 'Root'
136 -- is 'NodePara' if possible, and always a child.
137
138 -- | @mergeRow rows row@ append @row@ into @rows@, while merging what has to be.
139 --
140 -- * [@rows@] is old 'Rows', its |Root|s' 'cell_begin' are descending (non-strictly),
141 -- they MAY span over multilines, and they can be many from a single line.
142 -- * [@row@] is new 'Row', its |Root|s' 'cell_begin' are descending (non-strictly),
143 -- they MUST span only over a single and entire line.
144 --
145 -- This is the main entry point to build 'Rows' by accumulating 'Row' into them.
146 mergeRow :: Rows -> Row -> Rows
147 mergeRow rows row =
148 debug2_ "mergeRow" ("news",List.reverse row) ("olds",rows) $
149 mergeRowPrefix 0 rows $ List.reverse row
150
151 -- | Merge by considering matching prefixes.
152 --
153 -- 'HeaderGreat' and 'HeaderBar' work, not on indentation,
154 -- but on their vertical alignment as prefixes.
155 -- Hence, each new 'Row' has those prefixes zipped into a single one
156 -- when they match, are aligned and adjacent.
157 mergeRowPrefix :: ColNum -> Rows -> Row -> Rows
158 mergeRowPrefix col rows row =
159 debug3_ "mergeRowPrefix" ("col",col) ("news",row) ("olds",rows) $
160 case (row,rows) of
161 ([], _) -> rows
162 (_, []) -> undefined -- NOTE: cannot happen with initRows
163 ( _new@(Tree (Cell bn _en n) _ns):news
164 , _old@(Tree (Cell _bo eo _o) _os):_olds ) ->
165 case collapseRowsWhile isCollapsable rows of
166 [] -> mergeRowIndent rows row
167 head@(unTree -> ch@(Cell bh _eh h)) : olds' ->
168 case (n,h) of
169 -- NOTE: zipping: when new is HeaderGreat, collapse last line downto col
170 -- then check if there is a matching HeaderGreat,
171 -- if so, discard new and restart with a col advanced to new's beginning
172 (NodeHeader HeaderGreat{}, NodeHeader HeaderGreat{})
173 | isAdjacent && isMatching ch -> discard
174 -- NOTE: same for HeaderBar
175 (NodeHeader HeaderBar{}, NodeHeader HeaderBar{})
176 | isAdjacent && isMatching ch -> discard
177 -- NOTE: collapsing: any other new aligned or on the right of an adjacent head
178 -- makes it collapse entirely
179 (_, NodeHeader HeaderGreat{})
180 | col < pos_column bh -> collapse
181 -- NOTE: same for HeaderBar
182 (_, NodeHeader HeaderBar{})
183 | col < pos_column bh -> collapse
184 _ -> debug "mergeRowPrefix/indent" $ mergeRowIndent rows row
185 where
186 isAdjacent = pos_line bn - pos_line eo <= 1
187 discard = debug "mergeRowPrefix/discard" $ mergeRowPrefix (pos_column bh) rows news
188 collapse = debug "mergeRowPrefix/collapse" $ mergeRowPrefix col (collapseRoot head olds') row
189 where
190 isMatching (Cell bh _eh h) =
191 pos_column bn == pos_column bh &&
192 n == h
193 isCollapsable = debug2 "mergeRowPrefix/isCollapsable" "new" "old" $
194 \_t0@(unTree -> c0@(Cell b0 _e0 _n0)) _t1@(unTree -> Cell b1 e1 _n1) ->
195 not (isMatching c0) &&
196 (pos_line b0 - pos_line e1 <= 1) && -- adjacent
197 col < pos_column b1 -- righter than col
198
199 -- | Merge by considering indentation.
200 mergeRowIndent :: Rows -> Row -> Rows
201 mergeRowIndent rows row =
202 debug2_ "mergeRowIndent" ("news",row) ("olds",rows) $
203 case (row,rows) of
204 ([], _) -> rows
205 (_, []) -> undefined -- NOTE: cannot happen with initRows
206 ( new@(Tree (Cell bn en n) ns):news
207 ,old@(Tree (Cell bo eo o) os):olds ) ->
208 case debug0 "mergeRowIndent/colNew" (pos_column bn) `compare`
209 debug0 "mergeRowIndent/colOld" (pos_column bo) of
210 -- NOTE: new is on the left
211 LT ->
212 case (n,o) of
213 -- NOTE: merge adjacent NodeText
214 -- first
215 -- second
216 (NodeText tn, NodeText to)
217 | TL.null tn || TL.null to
218 , not isVerbatim -> collapse
219 | isAdjacent && isIndented -> merge $ Tree t (os<>ns)
220 where
221 t = NodeText <$> Cell boNew eo (indent<>to) <> Cell bn en tn
222 boNew = bo{pos_column=pos_column bn}
223 indent = TL.replicate (int64 $ pos_column bo - pos_column bn) " "
224 -- | Whether the horizontal delta is made of spaces
225 isIndented =
226 debug0 "mergeRowIndent/isIndented" $
227 case olds of
228 [] -> True
229 (unTree -> (cell_end -> ep)) : _ ->
230 case pos_line ep `compare` pos_line bo of
231 LT -> True
232 EQ -> pos_column ep <= pos_column bn
233 _ -> False
234 _ -> collapse
235 -- NOTE: new is vertically aligned
236 EQ ->
237 case (n,o) of
238 -- NOTE: preserve all NodeText "", but still split into two NodePara
239 (NodeText tn, NodeText to)
240 | TL.null tn || TL.null to
241 , not isVerbatim -> collapse
242 | isAdjacent -> merge $ Tree (NodeText <$> Cell bo eo to <> Cell bn en tn) (os<>ns)
243 -- NOTE: HeaderSection can parent Nodes at the same level
244 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
245 if debug0 "mergeRowIndent/lvlNew" lvlNew
246 > debug0 "mergeRowIndent/lvlOld" lvlOld
247 -- # old
248 -- ## new
249 then concat
250 -- ## old or # old
251 -- # new # new
252 else collapse
253 -- NOTE: old is no HeaderSection, then collapse to any older and loop
254 (NodeHeader HeaderSection{}, _)
255 | rows'@(sec:_) <- collapseRowsWhile isCollapsable rows
256 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
257 mergeRowIndent rows' row
258 where
259 isCollapsable = debug2 "mergeRowIndent/isCollapsable" "new" "old" $
260 \_t0@(unTree -> Cell b0 _e0 n0) _t1 ->
261 case n0 of
262 NodeHeader HeaderSection{} -> False
263 _ -> pos_column bn == pos_column b0
264 -- NOTE: in case of alignment, HeaderSection is parent
265 (_, NodeHeader HeaderSection{}) -> concat
266 --
267 _ -> replace
268 -- NOTE: new is on the right
269 GT ->
270 case (n,o) of
271 -- NOTE: only same line Root can be pushed on HeaderBar
272 -- DELME: (_, NodeHeader HeaderBar{}) | pos_column bn /= pos_column eo -> collapse
273 -- NOTE: keep NodeText "" out of old NodePara
274 (NodeText "", NodePara) -> collapse
275 -- NOTE: merge adjacent NodeText
276 (NodeText tn, NodeText to) ->
277 case isAdjacent of
278 _ | TL.null tn || TL.null to
279 , not isVerbatim -> collapse
280 -- old
281 -- new
282 True -> merge $ Tree (NodeText <$> Cell bo eo to <> Cell bn en tn) (os<>ns)
283 -- old
284 --
285 -- new
286 False -> mergeRowIndent (collapseRoot old olds) (shifted:news)
287 where
288 shifted = Tree (Cell bnNew en $ NodeText $ indent<>tn) (os<>ns)
289 bnNew = bn{pos_column=pos_column bo}
290 indent = TL.replicate (int64 $ pos_column bn - pos_column bo) " "
291 --
292 _ -> concat
293 where
294 isAdjacent = pos_line bn - pos_line eo <= 1
295 -- | Whether a parent semantic want new to stay a NodeText
296 isVerbatim = any p rows
297 where
298 p (unTree -> (unCell -> NodeHeader HeaderBar{})) = True
299 p _ = False
300 concat = debug "mergeRowIndent/concat" $ List.reverse row <> rows
301 merge m = debug "mergeRowIndent/merge" $ mergeRowIndent (m : olds) news
302 collapse = debug "mergeRowIndent/collapse" $ mergeRowIndent (collapseRoot old olds) row
303 replace = debug "mergeRowIndent/replace" $ mergeRowIndent (new : collapseRoot old olds) news
304
305 -- | Like 'mergeRowIndent', but without maintaining the appending,
306 -- hence collapsing all the 'Root's of the given 'Rows'.
307 --
308 -- NOTE: 'initRows' MUST have been the first 'Rows'
309 -- before calling 'mergeRowIndent' on it to get the given 'Rows'.
310 collapseRows :: Rows -> Roots
311 collapseRows rows =
312 debug1_ "collapseRows" ("rows",rows) $
313 case collapseRowsWhile (\_new _old -> True) rows of
314 [t] -> subTrees t
315 _ -> undefined
316 -- NOTE: subTrees returns the children of the updated initRows
317
318 -- | Collapse downto any last HeaderSection, returning it and its level.
319 collapseSection :: ColNum -> Rows -> Rows
320 collapseSection col = debug1 "collapseSection" "rows" go
321 where
322 go rows@(new@(unTree -> Cell bn _en n):olds)
323 | col <= pos_column bn =
324 case n of
325 NodeHeader HeaderSection{} -> rows
326 _ -> collapseSection col $ collapseRoot new $ go olds
327 go _ = mempty
328
329 collapseRowsWhile :: (Root -> Root -> Bool) -> Rows -> Rows
330 collapseRowsWhile test = debug1 "collapseRowsWhile" "rows" $ \case
331 [] -> mempty
332 rows@(new@(Tree (Cell bn _en n) _ns):news) ->
333 case news of
334 [] -> rows
335 old@(Tree (Cell bo eo o) _os):olds
336 | not $ test new old -> rows
337 | otherwise ->
338 case debug0 "collapseRowsWhile/colNew" (pos_column bn) `compare`
339 debug0 "collapseRowsWhile/colOld" (pos_column bo) of
340 -- NOTE: new is vertically aligned
341 EQ ->
342 case (n,o) of
343 -- NOTE: HeaderSection can parent Nodes at the same level
344 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
345 if debug0 "collapseRowsWhile/lvlNew" lvlNew
346 > debug0 "collapseRowsWhile/lvlOld" lvlOld
347 -- # old
348 -- ## new
349 then collapse
350 -- ## old or # old
351 -- # new # new
352 else
353 debug "collapseRowsWhile/replace" $
354 collapseRowsWhile test $ (new:) $ collapseRoot old olds
355 -- NOTE: old is no HeaderSection, then collapse to any older and loop
356 (NodeHeader HeaderSection{}, _)
357 | news'@(sec:_) <- debug0 "collapseRowsWhile/section" $ collapseRowsWhile isCollapsable news
358 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
359 collapseRowsWhile test news'
360 where
361 isCollapsable = debug2 "collapseRowsWhile/isCollapsable" "new" "old" $
362 \_t0@(unTree -> Cell b0 _e0 n0) _t1 ->
363 case n0 of
364 NodeHeader HeaderSection{} -> False
365 _ -> pos_column bn == pos_column b0
366 -- NOTE: in case of alignment, HeaderSection is parent
367 (_, NodeHeader HeaderSection{}) -> debug "collapseRowsWhile/section/parent" collapse
368 -- NOTE: merge within old NodePara.
369 (_, NodePara) | isAdjacent -> collapse
370 --
371 _ -> collapse2
372 -- NOTE: new is either on the left or on the right
373 _ -> collapse
374 where
375 isAdjacent = pos_line bn - pos_line eo <= 1
376 collapse = debug "collapseRowsWhile/collapse" $ collapseRowsWhile test $ collapseRoot new $ news
377 collapse2 = debug "collapseRowsWhile/collapse2" $ collapseRowsWhile test $ collapseRoot new $ collapseRoot old $ olds
378
379 -- | Put a 'Root' as a child of the head 'Root'.
380 --
381 -- NOTE: 'collapseRoot' is where 'NodePara' may be introduced.
382 --
383 -- NOTE: any NodeText/NodeText merging must have been done before.
384 collapseRoot :: Root -> Rows -> Rows
385 collapseRoot new@(Tree (Cell bn en n) _ns) rows =
386 debug2_ "collapseRoot" ("new",Seq.singleton new) ("rows",rows) $
387 case rows of
388 [] -> return new
389 old@(Tree (Cell bo eo o) os) : olds ->
390 case (n,o) of
391 -- NOTE: no child into NodeText
392 (_, NodeText{}) -> collapse2
393 -- NOTE: NodeText can begin a NodePara
394 (NodeText tn, _) | not $ TL.null tn ->
395 case o of
396 -- NOTE: no NodePara within those
397 NodeHeader HeaderEqual{} -> collapse
398 NodeHeader HeaderBar{} -> collapse
399 NodeHeader HeaderDashDash{} -> collapse
400 -- NOTE: NodePara within those
401 NodePara | not isAdjacent -> para
402 NodeHeader{} -> para
403 NodeGroup -> para
404 _ -> collapse
405 -- NOTE: amongst remaining nodes, only adjacent ones may enter an old NodePara.
406 -- Note that since a NodePara is never adjacent to another,
407 -- it is not nested within another.
408 (_, NodePara)
409 | isAdjacent ->
410 case n of
411 -- NOTE: no HeaderSection (even adjacent) within a NodePara
412 NodeHeader HeaderSection{} -> collapse2
413 _ -> collapse
414 | otherwise -> collapse2
415 _ -> collapse
416 where
417 isAdjacent = pos_line bn - pos_line eo <= 1
418 para = Tree (Cell bn en NodePara) (return new) : rows
419 collapse = Tree (Cell bo en o) (os |> new) : olds
420 collapse2 = collapseRoot new $ collapseRoot old olds