1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE PatternSynonyms #-}
3 {-# LANGUAGE ViewPatterns #-}
9 import Control.Monad (Monad(..))
11 import Data.Char (Char)
12 import Data.Eq (Eq(..))
13 import Data.Foldable (Foldable(..), any)
14 import Data.Function (($))
15 import Data.Functor ((<$>))
17 import Data.List.NonEmpty (NonEmpty(..))
18 import Data.Monoid (Monoid(..))
19 import Data.Ord (Ordering(..), Ord(..))
20 import Data.Semigroup (Semigroup(..))
21 import Data.Sequence ((|>))
22 import Data.TreeSeq.Strict (Tree(..), Trees)
23 import Prelude (undefined, Num(..))
24 import System.FilePath (FilePath)
25 import Text.Show (Show(..))
26 import qualified Data.List as List
27 import qualified Data.Sequence as Seq
28 import qualified Data.Text.Lazy as TL
36 -- | A single 'Tree' to gather all the 'Node's
37 -- simplifies greatly the navigation and transformations,
38 -- especially because the later XML or DTC output
39 -- are themselves a single tree-like data structure.
41 -- Also, having a single 'Tree' is easier to merge
42 -- XML coming from the first parsing phase (eg. @('NodeHeader' ('HeaderEqual' "li" ""))@),
43 -- and XML coming from the second parsing phase (eg. @NodePair (PairElem "li" [])@).
45 -- For error reporting, indentation sensitivity and paragraph grouping,
46 -- each 'Node' is annotated with a 'Cell'
47 -- spanning over all its content (sub-'Trees' included).
48 type Root = Tree (Cell Node)
49 type Roots = Trees (Cell Node)
51 pattern Tree0 :: a -> Tree a
52 pattern Tree0 a <- Tree a (null -> True)
53 where Tree0 a = Tree a mempty
57 = NodeHeader !Header -- ^ node, from first parsing (indentation-sensitive)
58 | NodeText !TL.Text -- ^ leaf verbatim text, from first parsing (indentation-sensitive)
59 | NodePair !Pair -- ^ node, from second parsing (on some 'NodeText's)
60 | NodeToken !Token -- ^ leaf, from second parsing (on some 'NodeText's)
61 | NodeLower !Name !ElemAttrs -- ^ node, @<name a=b@
62 | NodePara -- ^ node, gather trees by paragraph,
63 -- useful to know when to generate a <para> XML node
69 = HeaderColon !Name !White -- ^ @name: @
70 | HeaderEqual !Name !White -- ^ @name=@
71 | HeaderBar !Name !White -- ^ @name|@
72 | HeaderGreat !Name !White -- ^ @name>@
73 | HeaderBrackets !Name -- ^ @[name]@
74 | HeaderDot !Name -- ^ @1. @
75 | HeaderDash -- ^ @- @
76 | HeaderDashDash -- ^ @-- @
77 | HeaderSection !LevelSection -- ^ @# @
78 | HeaderDotSlash !FilePath -- ^ @./file @
79 deriving (Eq, Ord, Show)
80 instance Pretty Header
85 -- ** Type 'LevelSection'
86 type LevelSection = Int
90 = PairElem !ElemName !ElemAttrs -- ^ @\<name a0=v0 a1=v1>text\</name>@
91 | PairTag Bool -- ^ @\#text#@ or @^\#text#@
92 | PairAt Bool -- ^ @\@text\@@ or @^\@text\@@
93 | PairStar -- ^ @*text*@
94 | PairSlash -- ^ @/text/@
95 | PairUnderscore -- ^ @_value_@
96 | PairDash -- ^ @-text-@
97 | PairBackquote -- ^ @`text`@
98 | PairSinglequote -- ^ @'text'@
99 | PairDoublequote -- ^ @"text"@
100 | PairFrenchquote -- ^ @«text»@
101 | PairParen -- ^ @(text)@
102 | PairBrace -- ^ @{text}@
103 | PairBracket -- ^ @[text]@
104 deriving (Eq,Ord,Show)
110 | TokenEscape !Char -- ^ @\\char@
112 | TokenAt !Bool !Ref -- ^ @\@foo@ or @^\@foo@
113 | TokenTag !Bool !Ref -- ^ @\#foo@ or @^\#foo@
123 -- | In normal order: a list of 'Header's, maybe ended by 'Value', all read on the same line.
127 -- | In reverse order: a list of nodes in scope
128 -- (hence to which the next line can append to).
131 -- | Having an initial 'Root' simplifies 'mergeRowIndent':
132 -- one can always put the last 'Root' as a child to a previous one.
133 -- This 'Root' just has to be discarded by 'collapseRows'.
135 initRows = [Tree0 $ Cell (Span "" p p :| []) $ NodeHeader HeaderDash]
136 where p = Pos{pos_line= -1, pos_column=0}
137 -- NOTE: such that any following 'Root'
138 -- is 'NodePara' if possible, and always a child.
140 -- | @mergeRow rows row@ append @row@ into @rows@, while merging what has to be.
142 -- * [@rows@] is old 'Rows', its |Root|s' 'cell_begin' are descending (non-strictly),
143 -- they MAY span over multilines, and they can be many from a single line.
144 -- * [@row@] is new 'Row', its |Root|s' 'cell_begin' are descending (non-strictly),
145 -- they MUST span only over a single and entire line.
147 -- This is the main entry point to build 'Rows' by accumulating 'Row' into them.
148 mergeRow :: Rows -> Row -> Rows
150 debug2_ "mergeRow" ("news",List.reverse row) ("olds",rows) $
151 mergeRowPrefix 0 rows $ List.reverse row
153 -- | Merge by considering matching prefixes.
155 -- 'HeaderGreat' and 'HeaderBar' work, not on indentation,
156 -- but on their vertical alignment as prefixes.
157 -- Hence, each new 'Row' has those prefixes zipped into a single one
158 -- when they match, are aligned and adjacent.
159 mergeRowPrefix :: ColNum -> Rows -> Row -> Rows
160 mergeRowPrefix col rows row =
161 debug3_ "mergeRowPrefix" ("col",col) ("news",row) ("olds",rows) $
164 (_, []) -> undefined -- NOTE: cannot happen with initRows
165 ( _new@(Tree (Cell (Span _fn bn _en:|_sn) n) _ns):news
166 , _old@(Tree (Cell (Span _fo _bo eo:|_so) _o) _os):_olds ) ->
167 case collapseRowsWhile isCollapsable rows of
168 [] -> mergeRowIndent rows row
169 head@(unTree -> ch@(Cell (Span _fh bh _eh:|_sh) h)) : olds' ->
171 -- NOTE: zipping: when new is HeaderGreat, collapse last line downto col
172 -- then check if there is a matching HeaderGreat,
173 -- if so, discard new and restart with a col advanced to new's beginning
174 (NodeHeader HeaderGreat{}, NodeHeader HeaderGreat{})
175 | isAdjacent && isMatching ch -> discard
176 -- NOTE: same for HeaderBar
177 (NodeHeader HeaderBar{}, NodeHeader HeaderBar{})
178 | isAdjacent && isMatching ch -> discard
179 -- NOTE: collapsing: any other new aligned or on the right of an adjacent head
180 -- makes it collapse entirely
181 (_, NodeHeader HeaderGreat{})
182 | col < pos_column bh -> collapse
183 -- NOTE: same for HeaderBar
184 (_, NodeHeader HeaderBar{})
185 | col < pos_column bh -> collapse
186 _ -> debug "mergeRowPrefix/indent" $ mergeRowIndent rows row
188 isAdjacent = pos_line bn - pos_line eo <= 1
189 discard = debug "mergeRowPrefix/discard" $ mergeRowPrefix (pos_column bh) rows news
190 collapse = debug "mergeRowPrefix/collapse" $ mergeRowPrefix col (collapseRoot head olds') row
192 isMatching (Cell (Span _fh bh _eh:|_sh) h) =
193 pos_column bn == pos_column bh &&
195 isCollapsable = debug2 "mergeRowPrefix/isCollapsable" "new" "old" $
196 \_t0@(unTree -> c0@(Cell (Span _f0 b0 _e0:|_s0) _n0))
197 _t1@(unTree -> Cell (Span _f1 b1 e1:|_s1) _n1) ->
198 not (isMatching c0) &&
199 (pos_line b0 - pos_line e1 <= 1) && -- adjacent
200 col < pos_column b1 -- righter than col
202 -- | Merge by considering indentation.
203 mergeRowIndent :: Rows -> Row -> Rows
204 mergeRowIndent rows row =
205 debug2_ "mergeRowIndent" ("news",row) ("olds",rows) $
208 (_, []) -> undefined -- NOTE: cannot happen with initRows
209 ( new@(Tree (Cell ssn@(Span fn bn en:|sn) n) ns):news
210 ,old@(Tree (Cell sso@(Span fo bo eo:|so) o) os):olds ) ->
211 case debug0 "mergeRowIndent/colNew" (pos_column bn) `compare`
212 debug0 "mergeRowIndent/colOld" (pos_column bo) of
213 -- NOTE: new is on the left
216 -- NOTE: merge adjacent NodeText
219 (NodeText tn, NodeText to)
220 | TL.null tn || TL.null to
221 , not isVerbatim -> collapse
222 | isAdjacent && isIndented -> merge $ Tree t (os<>ns)
224 t = NodeText <$> Cell (Span fo boNew eo:|so) (indent<>to) <> Cell ssn tn
225 boNew = bo{pos_column=pos_column bn}
226 indent = TL.replicate (int64 $ pos_column bo - pos_column bn) " "
227 -- | Whether the horizontal delta is made of spaces
229 debug0 "mergeRowIndent/isIndented" $
232 (unTree -> (cell_location -> (span_end -> ep) :| _)) : _ ->
233 case pos_line ep `compare` pos_line bo of
235 EQ -> pos_column ep <= pos_column bn
238 -- NOTE: new is vertically aligned
241 -- NOTE: preserve all NodeText "", but still split into two NodePara
242 (NodeText tn, NodeText to)
243 | TL.null tn || TL.null to
244 , not isVerbatim -> collapse
245 | isAdjacent -> merge $ Tree (NodeText <$> Cell sso to <> Cell ssn tn) (os<>ns)
246 -- NOTE: HeaderSection can parent Nodes at the same level
247 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
248 if debug0 "mergeRowIndent/lvlNew" lvlNew
249 > debug0 "mergeRowIndent/lvlOld" lvlOld
256 -- NOTE: old is no HeaderSection, then collapse to any older and loop
257 (NodeHeader HeaderSection{}, _)
258 | rows'@(sec:_) <- collapseRowsWhile isCollapsable rows
259 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
260 mergeRowIndent rows' row
262 isCollapsable = debug2 "mergeRowIndent/isCollapsable" "new" "old" $
263 \_t0@(unTree -> Cell (Span _f0 b0 _e0:|_ss0) n0) _t1 ->
265 NodeHeader HeaderSection{} -> False
266 _ -> pos_column bn == pos_column b0
267 -- NOTE: in case of alignment, HeaderSection is parent
268 (_, NodeHeader HeaderSection{}) -> concat
271 -- NOTE: new is on the right
274 -- NOTE: keep NodeText "" out of old NodePara
275 (NodeText "", NodePara) -> collapse
276 -- NOTE: merge adjacent NodeText
277 (NodeText tn, NodeText to) ->
279 _ | TL.null tn || TL.null to
280 , not isVerbatim -> collapse
283 True -> merge $ Tree (NodeText <$> Cell sso to <> Cell ssn tn) (os<>ns)
287 False -> mergeRowIndent (collapseRoot old olds) (shifted:news)
289 shifted = Tree (Cell (Span fn bnNew en:|sn) $ NodeText $ indent<>tn) (os<>ns)
290 bnNew = bn{pos_column=pos_column bo}
291 indent = TL.replicate (int64 $ pos_column bn - pos_column bo) " "
295 isAdjacent = pos_line bn - pos_line eo <= 1
296 -- | Whether a parent semantic want new to stay a NodeText
297 isVerbatim = any p rows
299 p (unTree -> (unCell -> NodeHeader HeaderBar{})) = True
301 concat = debug "mergeRowIndent/concat" $ List.reverse row <> rows
302 merge m = debug "mergeRowIndent/merge" $ mergeRowIndent (m : olds) news
303 collapse = debug "mergeRowIndent/collapse" $ mergeRowIndent (collapseRoot old olds) row
304 replace = debug "mergeRowIndent/replace" $ mergeRowIndent (new : collapseRoot old olds) news
306 -- | Like 'mergeRowIndent', but without maintaining the appending,
307 -- hence collapsing all the 'Root's of the given 'Rows'.
309 -- NOTE: 'initRows' MUST have been the first 'Rows'
310 -- before calling 'mergeRowIndent' on it to get the given 'Rows'.
311 collapseRows :: Rows -> Roots
313 debug1_ "collapseRows" ("rows",rows) $
314 case collapseRowsWhile (\_new _old -> True) rows of
317 -- NOTE: subTrees returns the children of the updated initRows
319 -- | Collapse downto any last HeaderSection, returning it and its level.
320 collapseSection :: ColNum -> Rows -> Rows
321 collapseSection col = debug1 "collapseSection" "rows" go
323 go rows@(new@(unTree -> Cell (Span _fn bn _en:|_sn) n):olds)
324 | col <= pos_column bn =
326 NodeHeader HeaderSection{} -> rows
327 _ -> collapseSection col $ collapseRoot new $ go olds
330 collapseRowsWhile :: (Root -> Root -> Bool) -> Rows -> Rows
331 collapseRowsWhile test = debug1 "collapseRowsWhile" "rows" $ \case
333 rows@(new@(Tree (Cell (Span _fn bn _en:|_sn) n) _ns):news) ->
336 old@(Tree (Cell (Span _fo bo eo:|_so) o) _os):olds
337 | not $ test new old -> rows
339 case debug0 "collapseRowsWhile/colNew" (pos_column bn) `compare`
340 debug0 "collapseRowsWhile/colOld" (pos_column bo) of
341 -- NOTE: new is vertically aligned
344 -- NOTE: HeaderSection can parent Nodes at the same level
345 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
346 if debug0 "collapseRowsWhile/lvlNew" lvlNew
347 > debug0 "collapseRowsWhile/lvlOld" lvlOld
354 debug "collapseRowsWhile/replace" $
355 collapseRowsWhile test $ (new:) $ collapseRoot old olds
356 -- NOTE: old is no HeaderSection, then collapse to any older and loop
357 (NodeHeader HeaderSection{}, _)
358 | news'@(sec:_) <- debug0 "collapseRowsWhile/section" $ collapseRowsWhile isCollapsable news
359 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
360 collapseRowsWhile test news'
362 isCollapsable = debug2 "collapseRowsWhile/isCollapsable" "new" "old" $
363 \_t0@(unTree -> Cell (Span _f0 b0 _e0:|_s0) n0) _t1 ->
365 NodeHeader HeaderSection{} -> False
366 _ -> pos_column bn == pos_column b0
367 -- NOTE: in case of alignment, HeaderSection is parent
368 (_, NodeHeader HeaderSection{}) -> debug "collapseRowsWhile/section/parent" collapse
369 -- NOTE: merge within old NodePara.
370 (_, NodePara) | isAdjacent -> collapse
373 -- NOTE: new is either on the left or on the right
376 isAdjacent = pos_line bn - pos_line eo <= 1
377 collapse = debug "collapseRowsWhile/collapse" $ collapseRowsWhile test $ collapseRoot new $ news
378 collapse2 = debug "collapseRowsWhile/collapse2" $ collapseRowsWhile test $ collapseRoot new $ collapseRoot old $ olds
380 -- | Put a 'Root' as a child of the head 'Root'.
382 -- NOTE: 'collapseRoot' is where 'NodePara' may be introduced.
384 -- NOTE: any NodeText/NodeText merging must have been done before.
385 collapseRoot :: Root -> Rows -> Rows
386 collapseRoot new@(Tree (Cell ssn@(Span _fn bn en:|_sn) n) _ns) rows =
387 debug2_ "collapseRoot" ("new",Seq.singleton new) ("rows",rows) $
390 old@(Tree (Cell (Span fo bo eo:|so) o) os) : olds ->
392 -- NOTE: no child into NodeText
393 (_, NodeText{}) -> collapse2
394 -- NOTE: NodeText can begin a NodePara
395 (NodeText tn, _) | not $ TL.null tn ->
397 -- NOTE: no NodePara within those
398 NodeHeader HeaderEqual{} -> collapse
399 NodeHeader HeaderBar{} -> collapse
400 NodeHeader HeaderDashDash{} -> collapse
401 -- NOTE: NodePara within those
402 NodePara | not isAdjacent -> para
405 -- NOTE: amongst remaining nodes, only adjacent ones may enter an old NodePara.
406 -- Note that since a NodePara is never adjacent to another,
407 -- it is not nested within another.
411 -- NOTE: no HeaderSection (even adjacent) within a NodePara
412 NodeHeader HeaderSection{} -> collapse2
414 | otherwise -> collapse2
417 isAdjacent = pos_line bn - pos_line eo <= 1
418 para = Tree (Cell ssn NodePara) (return new) : rows
419 collapse = Tree (Cell (Span fo bo en:|so) o) (os |> new) : olds
420 collapse2 = collapseRoot new $ collapseRoot old olds