1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE PatternSynonyms #-}
3 {-# LANGUAGE ViewPatterns #-}
4 module Language.TCT.Tree
5 ( module Language.TCT.Tree
9 import Control.Monad (Monad(..))
11 import Data.Char (Char)
12 import Data.Eq (Eq(..))
13 import Data.Foldable (Foldable(..), any)
14 import Data.Function (($))
15 import Data.Functor ((<$>))
17 import Data.Monoid (Monoid(..))
18 import Data.Ord (Ordering(..), Ord(..))
19 import Data.Semigroup (Semigroup(..))
20 import Data.Sequence ((|>))
21 import Data.TreeSeq.Strict (Tree(..), Trees)
22 import Prelude (undefined, Num(..))
23 import System.FilePath (FilePath)
24 import Text.Show (Show(..))
25 import qualified Data.List as List
26 import qualified Data.Sequence as Seq
27 import qualified Data.Text.Lazy as TL
29 import Language.TCT.Utils
30 import Language.TCT.Cell
31 import Language.TCT.Elem
32 import Language.TCT.Debug
35 -- | A single 'Tree' to rule all the 'Node's
36 -- simplifies greatly the navigation and transformations,
37 -- especially because the later XML or DTC output
38 -- are themselves a single tree-like data structure.
40 -- Also, having a single 'Tree' is easier to merge
41 -- XML coming from the first parsing phase (eg. @('NodeHeader' ('HeaderEqual' "li" ""))@),
42 -- and XML coming from the second parsing phase (eg. @NodePair (PairElem "li" [])@).
44 -- For error reporting, each 'Node' is annotated with a 'Cell'
45 -- spanning over all its content (sub-'Trees' included).
46 type Root = Tree (Cell Node)
47 type Roots = Trees (Cell Node)
49 pattern Tree0 :: a -> Tree a
50 pattern Tree0 a <- Tree a (null -> True)
51 where Tree0 a = Tree a mempty
55 = NodeHeader !Header -- ^ node, from first parsing (indentation-sensitive)
56 | NodeText !TL.Text -- ^ leaf verbatim text, from first parsing (indentation-sensitive)
57 | NodePair !Pair -- ^ node, from second parsing (on some 'NodeText's)
58 | NodeToken !Token -- ^ leaf, from second parsing (on some 'NodeText's)
59 | NodeLower !Name !ElemAttrs -- ^ node, @<name a=b@
60 | NodePara -- ^ node, gather trees by paragraph,
61 -- useful to know when to generate a <para> XML node
62 | NodeGroup -- ^ node, group trees into a single tree,
63 -- useful to return many trees when only one is expected
69 = HeaderColon !Name !White -- ^ @name: @
70 | HeaderEqual !Name !White -- ^ @name=@
71 | HeaderBar !Name !White -- ^ @name|@
72 | HeaderGreat !Name !White -- ^ @name>@
73 | HeaderBrackets !Name -- ^ @[name]@
74 | HeaderDot !Name -- ^ @1. @
75 | HeaderDash -- ^ @- @
76 | HeaderDashDash -- ^ @-- @
77 | HeaderSection !LevelSection -- ^ @# @
78 | HeaderDotSlash !FilePath -- ^ @./file @
79 deriving (Eq, Ord, Show)
80 instance Pretty Header
85 -- ** Type 'LevelSection'
86 type LevelSection = Int
90 = PairElem !ElemName !ElemAttrs -- ^ @<elem n0=v0 n1=v1>value</elem>@
91 | PairHash -- ^ @#value#@
92 | PairStar -- ^ @*value*@
93 | PairSlash -- ^ @/value/@
94 | PairUnderscore -- ^ @_value_@
95 | PairDash -- ^ @-value-@
96 | PairBackquote -- ^ @`value`@
97 | PairSinglequote -- ^ @'value'@
98 | PairDoublequote -- ^ @"value"@
99 | PairFrenchquote -- ^ @«value»@
100 | PairParen -- ^ @(value)@
101 | PairBrace -- ^ @{value}@
102 | PairBracket -- ^ @[value]@
103 deriving (Eq,Ord,Show)
121 -- | In normal order: a list of 'Header's, maybe ended by 'Value', all read on the same line.
125 -- | In reverse order: a list of nodes in scope
126 -- (hence to which the next line can append to).
129 -- | Having an initial 'Root' simplifies 'mergeRowIndent':
130 -- one can always put the last 'Root' as a child to a previous one.
131 -- This 'Root' just has to be discarded by 'collapseRows'.
133 initRows = [Tree0 (Cell p p NodeGroup)]
134 where p = pos1{pos_line= -1, pos_column=0}
135 -- NOTE: such that any following 'Root'
136 -- is 'NodePara' if possible, and always a child.
138 -- | @mergeRow rows row@ append @row@ into @rows@, while merging what has to be.
140 -- * [@rows@] is old 'Rows', its |Root|s' 'cell_begin' are descending (non-strictly),
141 -- they MAY span over multilines, and they can be many from a single line.
142 -- * [@row@] is new 'Row', its |Root|s' 'cell_begin' are descending (non-strictly),
143 -- they MUST span only over a single and entire line.
145 -- This is the main entry point to build 'Rows' by accumulating 'Row' into them.
146 mergeRow :: Rows -> Row -> Rows
148 debug2_ "mergeRow" ("news",List.reverse row) ("olds",rows) $
149 mergeRowPrefix 0 rows $ List.reverse row
151 -- | Merge by considering matching prefixes.
153 -- 'HeaderGreat' and 'HeaderBar' work, not on indentation,
154 -- but on their vertical alignment as prefixes.
155 -- Hence, each new 'Row' has those prefixes zipped into a single one
156 -- when they match, are aligned and adjacent.
157 mergeRowPrefix :: ColNum -> Rows -> Row -> Rows
158 mergeRowPrefix col rows row =
159 debug3_ "mergeRowPrefix" ("col",col) ("news",row) ("olds",rows) $
162 (_, []) -> undefined -- NOTE: cannot happen with initRows
163 ( _new@(Tree (Cell bn _en n) _ns):news
164 , _old@(Tree (Cell _bo eo _o) _os):_olds ) ->
165 case collapseRowsWhile isCollapsable rows of
166 [] -> mergeRowIndent rows row
167 head@(unTree -> ch@(Cell bh _eh h)) : olds' ->
169 -- NOTE: zipping: when new is HeaderGreat, collapse last line downto col
170 -- then check if there is a matching HeaderGreat,
171 -- if so, discard new and restart with a col advanced to new's beginning
172 (NodeHeader HeaderGreat{}, NodeHeader HeaderGreat{})
173 | isAdjacent && isMatching ch -> discard
175 | pos_column bn == pos_column bh
179 -- NOTE: same for HeaderBar
180 (NodeHeader HeaderBar{}, NodeHeader HeaderBar{})
181 | isAdjacent && isMatching ch -> discard
182 -- NOTE: collapsing: any other new aligned or on the right of an adjacent head
183 -- makes it collapse entirely
184 (_, NodeHeader HeaderGreat{})
185 | col < pos_column bh -> collapse
186 -- NOTE: same for HeaderBar
187 (_, NodeHeader HeaderBar{})
188 | col < pos_column bh -> collapse
189 _ -> debug "mergeRowPrefix/indent" $ mergeRowIndent rows row
191 isAdjacent = pos_line bn - pos_line eo <= 1
192 discard = debug "mergeRowPrefix/discard" $ mergeRowPrefix (pos_column bh) rows news
193 collapse = debug "mergeRowPrefix/collapse" $ mergeRowPrefix col (collapseRoot head olds') row
195 isMatching (Cell bh _eh h) =
196 pos_column bn == pos_column bh &&
198 isCollapsable = debug2 "mergeRowPrefix/isCollapsable" "new" "old" $
199 \_t0@(unTree -> c0@(Cell b0 _e0 _n0)) _t1@(unTree -> Cell b1 e1 _n1) ->
200 not (isMatching c0) &&
201 (pos_line b0 - pos_line e1 <= 1) && -- adjacent
202 col < pos_column b1 -- righter than col
204 -- | Merge by considering indentation.
205 mergeRowIndent :: Rows -> Row -> Rows
206 mergeRowIndent rows row =
207 debug2_ "mergeRowIndent" ("news",row) ("olds",rows) $
210 (_, []) -> undefined -- NOTE: cannot happen with initRows
211 ( new@(Tree (Cell bn en n) ns):news
212 ,old@(Tree (Cell bo eo o) os):olds ) ->
213 case debug0 "mergeRowIndent/colNew" (pos_column bn) `compare`
214 debug0 "mergeRowIndent/colOld" (pos_column bo) of
215 -- NOTE: new is on the left
218 -- NOTE: merge adjacent NodeText
221 (NodeText tn, NodeText to)
222 | TL.null tn || TL.null to
223 , not isVerbatim -> collapse
224 | isAdjacent && isIndented -> merge $ Tree t (os<>ns)
226 t = NodeText <$> Cell boNew eo (indent<>to) <> Cell bn en tn
227 boNew = bo{pos_column=pos_column bn}
228 indent = TL.replicate (int64 $ pos_column bo - pos_column bn) " "
229 -- | Whether the horizontal delta is made of spaces
231 debug0 "mergeRowIndent/isIndented" $
234 (unTree -> cell_end -> ep) : _ ->
235 case pos_line ep `compare` pos_line bo of
237 EQ -> pos_column ep <= pos_column bn
240 -- NOTE: new is vertically aligned
243 -- NOTE: preserve all NodeText "", but still split into two NodePara
244 (NodeText tn, NodeText to)
245 | TL.null tn || TL.null to
246 , not isVerbatim -> collapse
247 | isAdjacent -> merge $ Tree (NodeText <$> Cell bo eo to <> Cell bn en tn) (os<>ns)
248 -- NOTE: HeaderSection can parent Nodes at the same level
249 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
250 if debug0 "mergeRowIndent/lvlNew" lvlNew
251 > debug0 "mergeRowIndent/lvlOld" lvlOld
258 -- NOTE: old is no HeaderSection, then collapse to any older and loop
259 (NodeHeader HeaderSection{}, _)
260 -- | rows'@(sec:olds') <- collapseSection (pos_column bn) rows
261 | rows'@(sec:_) <- collapseRowsWhile isCollapsable rows
262 , (unTree -> unCell -> NodeHeader HeaderSection{}) <- sec ->
263 mergeRowIndent rows' row
265 isCollapsable = debug2 "mergeRowIndent/isCollapsable" "new" "old" $
266 \_t0@(unTree -> Cell b0 _e0 n0) _t1 ->
268 NodeHeader HeaderSection{} -> False
269 _ -> pos_column bn == pos_column b0
270 -- NOTE: in case of alignment, HeaderSection is parent
271 (_, NodeHeader HeaderSection{}) -> concat
274 -- NOTE: new is on the right
277 -- NOTE: only same line Root can be pushed on HeaderBar
278 -- DELME: (_, NodeHeader HeaderBar{}) | pos_column bn /= pos_column eo -> collapse
279 -- NOTE: keep NodeText "" out of old NodePara
280 (NodeText "", NodePara) -> collapse
281 -- NOTE: merge adjacent NodeText
282 (NodeText tn, NodeText to) ->
284 _ | TL.null tn || TL.null to
285 , not isVerbatim -> collapse
288 True -> merge $ Tree (NodeText <$> Cell bo eo to <> Cell bn en tn) (os<>ns)
292 False -> mergeRowIndent (collapseRoot old olds) (shifted:news)
294 shifted = Tree (Cell bnNew en $ NodeText $ indent<>tn) (os<>ns)
295 bnNew = bn{pos_column=pos_column bo}
296 indent = TL.replicate (int64 $ pos_column bn - pos_column bo) " "
300 isAdjacent = pos_line bn - pos_line eo <= 1
301 -- | Whether a parent semantic want new to stay a NodeText
302 isVerbatim = any p rows
304 p (unTree -> unCell -> NodeHeader HeaderBar{}) = True
306 concat = debug "mergeRowIndent/concat" $ List.reverse row <> rows
307 merge m = debug "mergeRowIndent/merge" $ mergeRowIndent (m : olds) news
308 collapse = debug "mergeRowIndent/collapse" $ mergeRowIndent (collapseRoot old olds) row
309 replace = debug "mergeRowIndent/replace" $ mergeRowIndent (new : collapseRoot old olds) news
311 -- | Like 'mergeRowIndent', but without maintaining the appending,
312 -- hence collapsing all the 'Root's of the given 'Rows'.
314 -- NOTE: 'initRows' MUST have been the first 'Rows'
315 -- before calling 'mergeRowIndent' on it to get the given 'Rows'.
316 collapseRows :: Rows -> Roots
318 debug1_ "collapseRows" ("rows",rows) $
319 case collapseRowsWhile (\_new _old -> True) rows of
322 -- NOTE: subTrees returns the children of the updated initRows
324 -- | Collapse downto any last HeaderSection, returning it and its level.
325 collapseSection :: ColNum -> Rows -> Rows
326 collapseSection col = debug1 "collapseSection" "rows" go
328 go rows@(new@(unTree -> Cell bn _en n):olds)
329 | col <= pos_column bn =
331 NodeHeader HeaderSection{} -> rows
332 _ -> collapseSection col $ collapseRoot new $ go olds
335 collapseRowsWhile :: (Root -> Root -> Bool) -> Rows -> Rows
336 collapseRowsWhile test = debug1 "collapseRowsWhile" "rows" $ \case
338 rows@(new@(Tree (Cell bn _en n) _ns):news) ->
341 old@(Tree (Cell bo eo o) _os):olds
342 | not $ test new old -> rows
344 case debug0 "collapseRowsWhile/colNew" (pos_column bn) `compare`
345 debug0 "collapseRowsWhile/colOld" (pos_column bo) of
346 -- NOTE: new is vertically aligned
349 -- NOTE: HeaderSection can parent Nodes at the same level
350 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
351 if debug0 "collapseRowsWhile/lvlNew" lvlNew
352 > debug0 "collapseRowsWhile/lvlOld" lvlOld
359 debug "collapseRowsWhile/replace" $
360 collapseRowsWhile test $ (new:) $ collapseRoot old olds
361 -- NOTE: old is no HeaderSection, then collapse to any older and loop
362 (NodeHeader HeaderSection{}, _)
363 | news'@(sec:_) <- debug0 "collapseRowsWhile/section" $ collapseRowsWhile isCollapsable news
364 , (unTree -> unCell -> NodeHeader HeaderSection{}) <- sec ->
365 collapseRowsWhile test news'
367 isCollapsable = debug2 "collapseRowsWhile/isCollapsable" "new" "old" $
368 \_t0@(unTree -> Cell b0 _e0 n0) _t1 ->
370 NodeHeader HeaderSection{} -> False
371 _ -> pos_column bn == pos_column b0
372 -- NOTE: in case of alignment, HeaderSection is parent
373 (_, NodeHeader HeaderSection{}) -> debug "collapseRowsWhile/section/parent" collapse
374 -- NOTE: merge within old NodePara.
375 (_, NodePara) | isAdjacent -> collapse
378 -- NOTE: new is either on the left or on the right
381 isAdjacent = pos_line bn - pos_line eo <= 1
382 collapse = debug "collapseRowsWhile/collapse" $ collapseRowsWhile test $ collapseRoot new $ news
383 collapse2 = debug "collapseRowsWhile/collapse2" $ collapseRowsWhile test $ collapseRoot new $ collapseRoot old $ olds
385 -- | Put a 'Root' as a child of the head 'Root'.
387 -- NOTE: 'collapseRoot' is where 'NodePara' may be introduced.
388 -- NOTE: any NodeText/NodeText merging must have been done before.
389 collapseRoot :: Root -> Rows -> Rows
390 collapseRoot new@(Tree (Cell bn en n) _ns) rows =
391 debug2_ "collapseRoot" ("new",Seq.singleton new) ("rows",rows) $
394 old@(Tree (Cell bo eo o) os) : olds ->
396 -- NOTE: no child into NodeText
397 (_, NodeText{}) -> collapse2
398 -- NOTE: NodeText can begin a NodePara
399 (NodeText tn, _) | not $ TL.null tn ->
401 -- NOTE: no NodePara within those
402 NodeHeader HeaderEqual{} -> collapse
403 NodeHeader HeaderBar{} -> collapse
404 NodeHeader HeaderDashDash{} -> collapse
405 -- NOTE: NodePara within those
406 NodePara | not isAdjacent -> para
410 -- NOTE: amongst remaining nodes, only adjacent ones may enter an old NodePara.
411 -- Note that since a NodePara is never adjacent to another,
412 -- it is not nested within another.
416 -- NOTE: no HeaderSection (even adjacent) within a NodePara
417 NodeHeader HeaderSection{} -> collapse2
419 | otherwise -> collapse2
422 isAdjacent = pos_line bn - pos_line eo <= 1
423 para = Tree (Cell bn en NodePara) (return new) : rows
424 collapse = Tree (Cell bo en o) (os |> new) : olds
425 collapse2 = collapseRoot new $ collapseRoot old olds