]> Git — Sourcephile - doclang.git/blob - Language/TCT/Tree.hs
Remove NodeGroup, as it can break parsing based on Seq.spanl.
[doclang.git] / Language / TCT / Tree.hs
1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE PatternSynonyms #-}
3 {-# LANGUAGE ViewPatterns #-}
4 module Language.TCT.Tree
5 ( module Language.TCT.Tree
6 , Tree(..), Trees
7 ) where
8
9 import Control.Monad (Monad(..))
10 import Data.Bool
11 import Data.Char (Char)
12 import Data.Eq (Eq(..))
13 import Data.Foldable (Foldable(..), any)
14 import Data.Function (($))
15 import Data.Functor ((<$>))
16 import Data.Int (Int)
17 import Data.Monoid (Monoid(..))
18 import Data.Ord (Ordering(..), Ord(..))
19 import Data.Semigroup (Semigroup(..))
20 import Data.Sequence ((|>))
21 import Data.TreeSeq.Strict (Tree(..), Trees)
22 import Prelude (undefined, Num(..))
23 import System.FilePath (FilePath)
24 import Text.Show (Show(..))
25 import qualified Data.List as List
26 import qualified Data.Sequence as Seq
27 import qualified Data.Text.Lazy as TL
28
29 import Language.TCT.Utils
30 import Language.TCT.Cell
31 import Language.TCT.Elem
32 import Language.TCT.Debug
33
34 -- * Type 'Root'
35 -- | A single 'Tree' to rule all the 'Node's
36 -- simplifies greatly the navigation and transformations,
37 -- especially because the later XML or DTC output
38 -- are themselves a single tree-like data structure.
39 --
40 -- Also, having a single 'Tree' is easier to merge
41 -- XML coming from the first parsing phase (eg. @('NodeHeader' ('HeaderEqual' "li" ""))@),
42 -- and XML coming from the second parsing phase (eg. @NodePair (PairElem "li" [])@).
43 --
44 -- For error reporting, each 'Node' is annotated with a 'Cell'
45 -- spanning over all its content (sub-'Trees' included).
46 type Root = Tree (Cell Node)
47 type Roots = Trees (Cell Node)
48
49 pattern Tree0 :: a -> Tree a
50 pattern Tree0 a <- Tree a (null -> True)
51 where Tree0 a = Tree a mempty
52
53 -- * Type 'Node'
54 data Node
55 = NodeHeader !Header -- ^ node, from first parsing (indentation-sensitive)
56 | NodeText !TL.Text -- ^ leaf verbatim text, from first parsing (indentation-sensitive)
57 | NodePair !Pair -- ^ node, from second parsing (on some 'NodeText's)
58 | NodeToken !Token -- ^ leaf, from second parsing (on some 'NodeText's)
59 | NodeLower !Name !ElemAttrs -- ^ node, @<name a=b@
60 | NodePara -- ^ node, gather trees by paragraph,
61 -- useful to know when to generate a <para> XML node
62 deriving (Eq,Show)
63 instance Pretty Node
64
65 -- * Type 'Header'
66 data Header
67 = HeaderColon !Name !White -- ^ @name: @
68 | HeaderEqual !Name !White -- ^ @name=@
69 | HeaderBar !Name !White -- ^ @name|@
70 | HeaderGreat !Name !White -- ^ @name>@
71 | HeaderBrackets !Name -- ^ @[name]@
72 | HeaderDot !Name -- ^ @1. @
73 | HeaderDash -- ^ @- @
74 | HeaderDashDash -- ^ @-- @
75 | HeaderSection !LevelSection -- ^ @# @
76 | HeaderDotSlash !FilePath -- ^ @./file @
77 deriving (Eq, Ord, Show)
78 instance Pretty Header
79
80 -- ** Type 'Name'
81 type Name = TL.Text
82
83 -- ** Type 'LevelSection'
84 type LevelSection = Int
85
86 -- * Type 'Pair'
87 data Pair
88 = PairElem !ElemName !ElemAttrs -- ^ @\<name a0=v0 a1=v1>text\</name>@
89 | PairHash -- ^ @\#text#@
90 | PairStar -- ^ @*text*@
91 | PairSlash -- ^ @/text/@
92 | PairUnderscore -- ^ @_value_@
93 | PairDash -- ^ @-text-@
94 | PairBackquote -- ^ @`text`@
95 | PairSinglequote -- ^ @'text'@
96 | PairDoublequote -- ^ @"text"@
97 | PairFrenchquote -- ^ @«text»@
98 | PairParen -- ^ @(text)@
99 | PairBrace -- ^ @{text}@
100 | PairBracket -- ^ @[text]@
101 deriving (Eq,Ord,Show)
102 instance Pretty Pair
103
104 -- * Type 'Token'
105 data Token
106 = TokenText !TL.Text
107 | TokenEscape !Char
108 | TokenLink !Link
109 | TokenTag !Tag
110 deriving (Eq,Show)
111
112 -- ** Type 'Tag'
113 type Tag = TL.Text
114
115 -- ** Type 'Link'
116 type Link = TL.Text
117
118 -- * Type 'Row'
119 -- | In normal order: a list of 'Header's, maybe ended by 'Value', all read on the same line.
120 type Row = [Root]
121
122 -- ** Type 'Rows'
123 -- | In reverse order: a list of nodes in scope
124 -- (hence to which the next line can append to).
125 type Rows = [Root]
126
127 -- | Having an initial 'Root' simplifies 'mergeRowIndent':
128 -- one can always put the last 'Root' as a child to a previous one.
129 -- This 'Root' just has to be discarded by 'collapseRows'.
130 initRows :: Rows
131 initRows = [Tree0 $ Cell p p $ NodeHeader HeaderDash]
132 where p = pos1{pos_line= -1, pos_column=0}
133 -- NOTE: such that any following 'Root'
134 -- is 'NodePara' if possible, and always a child.
135
136 -- | @mergeRow rows row@ append @row@ into @rows@, while merging what has to be.
137 --
138 -- * [@rows@] is old 'Rows', its |Root|s' 'cell_begin' are descending (non-strictly),
139 -- they MAY span over multilines, and they can be many from a single line.
140 -- * [@row@] is new 'Row', its |Root|s' 'cell_begin' are descending (non-strictly),
141 -- they MUST span only over a single and entire line.
142 --
143 -- This is the main entry point to build 'Rows' by accumulating 'Row' into them.
144 mergeRow :: Rows -> Row -> Rows
145 mergeRow rows row =
146 debug2_ "mergeRow" ("news",List.reverse row) ("olds",rows) $
147 mergeRowPrefix 0 rows $ List.reverse row
148
149 -- | Merge by considering matching prefixes.
150 --
151 -- 'HeaderGreat' and 'HeaderBar' work, not on indentation,
152 -- but on their vertical alignment as prefixes.
153 -- Hence, each new 'Row' has those prefixes zipped into a single one
154 -- when they match, are aligned and adjacent.
155 mergeRowPrefix :: ColNum -> Rows -> Row -> Rows
156 mergeRowPrefix col rows row =
157 debug3_ "mergeRowPrefix" ("col",col) ("news",row) ("olds",rows) $
158 case (row,rows) of
159 ([], _) -> rows
160 (_, []) -> undefined -- NOTE: cannot happen with initRows
161 ( _new@(Tree (Cell bn _en n) _ns):news
162 , _old@(Tree (Cell _bo eo _o) _os):_olds ) ->
163 case collapseRowsWhile isCollapsable rows of
164 [] -> mergeRowIndent rows row
165 head@(unTree -> ch@(Cell bh _eh h)) : olds' ->
166 case (n,h) of
167 -- NOTE: zipping: when new is HeaderGreat, collapse last line downto col
168 -- then check if there is a matching HeaderGreat,
169 -- if so, discard new and restart with a col advanced to new's beginning
170 (NodeHeader HeaderGreat{}, NodeHeader HeaderGreat{})
171 | isAdjacent && isMatching ch -> discard
172 -- NOTE: same for HeaderBar
173 (NodeHeader HeaderBar{}, NodeHeader HeaderBar{})
174 | isAdjacent && isMatching ch -> discard
175 -- NOTE: collapsing: any other new aligned or on the right of an adjacent head
176 -- makes it collapse entirely
177 (_, NodeHeader HeaderGreat{})
178 | col < pos_column bh -> collapse
179 -- NOTE: same for HeaderBar
180 (_, NodeHeader HeaderBar{})
181 | col < pos_column bh -> collapse
182 _ -> debug "mergeRowPrefix/indent" $ mergeRowIndent rows row
183 where
184 isAdjacent = pos_line bn - pos_line eo <= 1
185 discard = debug "mergeRowPrefix/discard" $ mergeRowPrefix (pos_column bh) rows news
186 collapse = debug "mergeRowPrefix/collapse" $ mergeRowPrefix col (collapseRoot head olds') row
187 where
188 isMatching (Cell bh _eh h) =
189 pos_column bn == pos_column bh &&
190 n == h
191 isCollapsable = debug2 "mergeRowPrefix/isCollapsable" "new" "old" $
192 \_t0@(unTree -> c0@(Cell b0 _e0 _n0)) _t1@(unTree -> Cell b1 e1 _n1) ->
193 not (isMatching c0) &&
194 (pos_line b0 - pos_line e1 <= 1) && -- adjacent
195 col < pos_column b1 -- righter than col
196
197 -- | Merge by considering indentation.
198 mergeRowIndent :: Rows -> Row -> Rows
199 mergeRowIndent rows row =
200 debug2_ "mergeRowIndent" ("news",row) ("olds",rows) $
201 case (row,rows) of
202 ([], _) -> rows
203 (_, []) -> undefined -- NOTE: cannot happen with initRows
204 ( new@(Tree (Cell bn en n) ns):news
205 ,old@(Tree (Cell bo eo o) os):olds ) ->
206 case debug0 "mergeRowIndent/colNew" (pos_column bn) `compare`
207 debug0 "mergeRowIndent/colOld" (pos_column bo) of
208 -- NOTE: new is on the left
209 LT ->
210 case (n,o) of
211 -- NOTE: merge adjacent NodeText
212 -- first
213 -- second
214 (NodeText tn, NodeText to)
215 | TL.null tn || TL.null to
216 , not isVerbatim -> collapse
217 | isAdjacent && isIndented -> merge $ Tree t (os<>ns)
218 where
219 t = NodeText <$> Cell boNew eo (indent<>to) <> Cell bn en tn
220 boNew = bo{pos_column=pos_column bn}
221 indent = TL.replicate (int64 $ pos_column bo - pos_column bn) " "
222 -- | Whether the horizontal delta is made of spaces
223 isIndented =
224 debug0 "mergeRowIndent/isIndented" $
225 case olds of
226 [] -> True
227 (unTree -> (cell_end -> ep)) : _ ->
228 case pos_line ep `compare` pos_line bo of
229 LT -> True
230 EQ -> pos_column ep <= pos_column bn
231 _ -> False
232 _ -> collapse
233 -- NOTE: new is vertically aligned
234 EQ ->
235 case (n,o) of
236 -- NOTE: preserve all NodeText "", but still split into two NodePara
237 (NodeText tn, NodeText to)
238 | TL.null tn || TL.null to
239 , not isVerbatim -> collapse
240 | isAdjacent -> merge $ Tree (NodeText <$> Cell bo eo to <> Cell bn en tn) (os<>ns)
241 -- NOTE: HeaderSection can parent Nodes at the same level
242 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
243 if debug0 "mergeRowIndent/lvlNew" lvlNew
244 > debug0 "mergeRowIndent/lvlOld" lvlOld
245 -- # old
246 -- ## new
247 then concat
248 -- ## old or # old
249 -- # new # new
250 else collapse
251 -- NOTE: old is no HeaderSection, then collapse to any older and loop
252 (NodeHeader HeaderSection{}, _)
253 | rows'@(sec:_) <- collapseRowsWhile isCollapsable rows
254 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
255 mergeRowIndent rows' row
256 where
257 isCollapsable = debug2 "mergeRowIndent/isCollapsable" "new" "old" $
258 \_t0@(unTree -> Cell b0 _e0 n0) _t1 ->
259 case n0 of
260 NodeHeader HeaderSection{} -> False
261 _ -> pos_column bn == pos_column b0
262 -- NOTE: in case of alignment, HeaderSection is parent
263 (_, NodeHeader HeaderSection{}) -> concat
264 --
265 _ -> replace
266 -- NOTE: new is on the right
267 GT ->
268 case (n,o) of
269 -- NOTE: only same line Root can be pushed on HeaderBar
270 -- DELME: (_, NodeHeader HeaderBar{}) | pos_column bn /= pos_column eo -> collapse
271 -- NOTE: keep NodeText "" out of old NodePara
272 (NodeText "", NodePara) -> collapse
273 -- NOTE: merge adjacent NodeText
274 (NodeText tn, NodeText to) ->
275 case isAdjacent of
276 _ | TL.null tn || TL.null to
277 , not isVerbatim -> collapse
278 -- old
279 -- new
280 True -> merge $ Tree (NodeText <$> Cell bo eo to <> Cell bn en tn) (os<>ns)
281 -- old
282 --
283 -- new
284 False -> mergeRowIndent (collapseRoot old olds) (shifted:news)
285 where
286 shifted = Tree (Cell bnNew en $ NodeText $ indent<>tn) (os<>ns)
287 bnNew = bn{pos_column=pos_column bo}
288 indent = TL.replicate (int64 $ pos_column bn - pos_column bo) " "
289 --
290 _ -> concat
291 where
292 isAdjacent = pos_line bn - pos_line eo <= 1
293 -- | Whether a parent semantic want new to stay a NodeText
294 isVerbatim = any p rows
295 where
296 p (unTree -> (unCell -> NodeHeader HeaderBar{})) = True
297 p _ = False
298 concat = debug "mergeRowIndent/concat" $ List.reverse row <> rows
299 merge m = debug "mergeRowIndent/merge" $ mergeRowIndent (m : olds) news
300 collapse = debug "mergeRowIndent/collapse" $ mergeRowIndent (collapseRoot old olds) row
301 replace = debug "mergeRowIndent/replace" $ mergeRowIndent (new : collapseRoot old olds) news
302
303 -- | Like 'mergeRowIndent', but without maintaining the appending,
304 -- hence collapsing all the 'Root's of the given 'Rows'.
305 --
306 -- NOTE: 'initRows' MUST have been the first 'Rows'
307 -- before calling 'mergeRowIndent' on it to get the given 'Rows'.
308 collapseRows :: Rows -> Roots
309 collapseRows rows =
310 debug1_ "collapseRows" ("rows",rows) $
311 case collapseRowsWhile (\_new _old -> True) rows of
312 [t] -> subTrees t
313 _ -> undefined
314 -- NOTE: subTrees returns the children of the updated initRows
315
316 -- | Collapse downto any last HeaderSection, returning it and its level.
317 collapseSection :: ColNum -> Rows -> Rows
318 collapseSection col = debug1 "collapseSection" "rows" go
319 where
320 go rows@(new@(unTree -> Cell bn _en n):olds)
321 | col <= pos_column bn =
322 case n of
323 NodeHeader HeaderSection{} -> rows
324 _ -> collapseSection col $ collapseRoot new $ go olds
325 go _ = mempty
326
327 collapseRowsWhile :: (Root -> Root -> Bool) -> Rows -> Rows
328 collapseRowsWhile test = debug1 "collapseRowsWhile" "rows" $ \case
329 [] -> mempty
330 rows@(new@(Tree (Cell bn _en n) _ns):news) ->
331 case news of
332 [] -> rows
333 old@(Tree (Cell bo eo o) _os):olds
334 | not $ test new old -> rows
335 | otherwise ->
336 case debug0 "collapseRowsWhile/colNew" (pos_column bn) `compare`
337 debug0 "collapseRowsWhile/colOld" (pos_column bo) of
338 -- NOTE: new is vertically aligned
339 EQ ->
340 case (n,o) of
341 -- NOTE: HeaderSection can parent Nodes at the same level
342 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
343 if debug0 "collapseRowsWhile/lvlNew" lvlNew
344 > debug0 "collapseRowsWhile/lvlOld" lvlOld
345 -- # old
346 -- ## new
347 then collapse
348 -- ## old or # old
349 -- # new # new
350 else
351 debug "collapseRowsWhile/replace" $
352 collapseRowsWhile test $ (new:) $ collapseRoot old olds
353 -- NOTE: old is no HeaderSection, then collapse to any older and loop
354 (NodeHeader HeaderSection{}, _)
355 | news'@(sec:_) <- debug0 "collapseRowsWhile/section" $ collapseRowsWhile isCollapsable news
356 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
357 collapseRowsWhile test news'
358 where
359 isCollapsable = debug2 "collapseRowsWhile/isCollapsable" "new" "old" $
360 \_t0@(unTree -> Cell b0 _e0 n0) _t1 ->
361 case n0 of
362 NodeHeader HeaderSection{} -> False
363 _ -> pos_column bn == pos_column b0
364 -- NOTE: in case of alignment, HeaderSection is parent
365 (_, NodeHeader HeaderSection{}) -> debug "collapseRowsWhile/section/parent" collapse
366 -- NOTE: merge within old NodePara.
367 (_, NodePara) | isAdjacent -> collapse
368 --
369 _ -> collapse2
370 -- NOTE: new is either on the left or on the right
371 _ -> collapse
372 where
373 isAdjacent = pos_line bn - pos_line eo <= 1
374 collapse = debug "collapseRowsWhile/collapse" $ collapseRowsWhile test $ collapseRoot new $ news
375 collapse2 = debug "collapseRowsWhile/collapse2" $ collapseRowsWhile test $ collapseRoot new $ collapseRoot old $ olds
376
377 -- | Put a 'Root' as a child of the head 'Root'.
378 --
379 -- NOTE: 'collapseRoot' is where 'NodePara' may be introduced.
380 --
381 -- NOTE: any NodeText/NodeText merging must have been done before.
382 collapseRoot :: Root -> Rows -> Rows
383 collapseRoot new@(Tree (Cell bn en n) _ns) rows =
384 debug2_ "collapseRoot" ("new",Seq.singleton new) ("rows",rows) $
385 case rows of
386 [] -> return new
387 old@(Tree (Cell bo eo o) os) : olds ->
388 case (n,o) of
389 -- NOTE: no child into NodeText
390 (_, NodeText{}) -> collapse2
391 -- NOTE: NodeText can begin a NodePara
392 (NodeText tn, _) | not $ TL.null tn ->
393 case o of
394 -- NOTE: no NodePara within those
395 NodeHeader HeaderEqual{} -> collapse
396 NodeHeader HeaderBar{} -> collapse
397 NodeHeader HeaderDashDash{} -> collapse
398 -- NOTE: NodePara within those
399 NodePara | not isAdjacent -> para
400 NodeHeader{} -> para
401 _ -> collapse
402 -- NOTE: amongst remaining nodes, only adjacent ones may enter an old NodePara.
403 -- Note that since a NodePara is never adjacent to another,
404 -- it is not nested within another.
405 (_, NodePara)
406 | isAdjacent ->
407 case n of
408 -- NOTE: no HeaderSection (even adjacent) within a NodePara
409 NodeHeader HeaderSection{} -> collapse2
410 _ -> collapse
411 | otherwise -> collapse2
412 _ -> collapse
413 where
414 isAdjacent = pos_line bn - pos_line eo <= 1
415 para = Tree (Cell bn en NodePara) (return new) : rows
416 collapse = Tree (Cell bo en o) (os |> new) : olds
417 collapse2 = collapseRoot new $ collapseRoot old olds