1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE PatternSynonyms #-}
3 {-# LANGUAGE ViewPatterns #-}
9 import Control.Monad (Monad(..))
11 import Data.Char (Char)
12 import Data.Eq (Eq(..))
13 import Data.Foldable (Foldable(..), any)
14 import Data.Function (($))
15 import Data.Functor ((<$>))
17 import Data.List.NonEmpty (NonEmpty(..))
18 import Data.Monoid (Monoid(..))
19 import Data.Ord (Ordering(..), Ord(..))
20 import Data.Semigroup (Semigroup(..))
21 import Data.Sequence ((|>))
22 import Data.TreeSeq.Strict (Tree(..), Trees)
23 import Prelude (undefined, Num(..))
24 import System.FilePath (FilePath)
25 import Text.Show (Show(..))
26 import qualified Data.List as List
27 import qualified Data.Sequence as Seq
28 import qualified Data.Text.Lazy as TL
36 -- | A single 'Tree' to gather all the 'Node's
37 -- simplifies greatly the navigation and transformations,
38 -- especially because the later XML or DTC output
39 -- are themselves a single tree-like data structure.
41 -- Also, having a single 'Tree' is easier to merge
42 -- XML coming from the first parsing phase (eg. @('NodeHeader' ('HeaderEqual' "li" ""))@),
43 -- and XML coming from the second parsing phase (eg. @NodePair (PairElem "li" [])@).
45 -- For error reporting, indentation sensitivity and paragraph grouping,
46 -- each 'Node' is annotated with a 'Cell'
47 -- spanning over all its content (sub-'Trees' included).
48 type Root = Tree (Cell Node)
49 type Roots = Trees (Cell Node)
51 pattern Tree0 :: a -> Tree a
52 pattern Tree0 a <- Tree a (null -> True)
53 where Tree0 a = Tree a mempty
57 = NodeHeader !Header -- ^ node, from first parsing (indentation-sensitive)
58 | NodeText !TL.Text -- ^ leaf verbatim text, from first parsing (indentation-sensitive)
59 | NodePair !Pair -- ^ node, from second parsing (on some 'NodeText's)
60 | NodeToken !Token -- ^ leaf, from second parsing (on some 'NodeText's)
61 | NodeLower !Name !ElemAttrs -- ^ node, @<name a=b@
62 | NodePara -- ^ node, gather trees by paragraph,
63 -- useful to know when to generate a <para> XML node
69 = HeaderColon !Name !White -- ^ @name: @
70 | HeaderEqual !Name !White -- ^ @name=@
71 | HeaderBar !Name !White -- ^ @name|@
72 | HeaderGreat !Name !White -- ^ @name>@
73 | HeaderBrackets !Name -- ^ @[name]@
74 | HeaderDot !Name -- ^ @1. @
75 | HeaderDash -- ^ @- @
76 | HeaderDashDash -- ^ @-- @
77 | HeaderSection !LevelSection -- ^ @# @
78 | HeaderDotSlash !FilePath -- ^ @./file @
79 deriving (Eq, Ord, Show)
80 instance Pretty Header
85 -- ** Type 'LevelSection'
86 type LevelSection = Int
90 = PairElem !ElemName !ElemAttrs -- ^ @\<name a0=v0 a1=v1>text\</name>@
91 | PairTag Bool -- ^ @\#text\#@ or @~\#text\#@
92 | PairAt Bool -- ^ @\@text\@@ or @~\@text\@@
93 | PairStar -- ^ @*text*@
94 | PairSlash -- ^ @/text/@
95 | PairUnderscore -- ^ @_value_@
96 | PairDash -- ^ @-text-@
97 | PairBackquote -- ^ @`text`@
98 | PairSinglequote -- ^ @'text'@
99 | PairDoublequote -- ^ @"text"@
100 | PairFrenchquote -- ^ @«text»@
101 | PairParen -- ^ @(text)@
102 | PairBrace -- ^ @{text}@
103 | PairBracket -- ^ @[text]@
104 deriving (Eq,Ord,Show)
110 | TokenEscape !Char -- ^ @\\char@
112 | TokenAt !Bool !Ref -- ^ @\@foo@ or @~\@foo@
113 | TokenTag !Bool !Ref -- ^ @\#foo@ or @~\#foo@
119 = FilterAnd !Filter !Filter
120 | FilterOr !Filter !Filter
124 deriving (Eq,Ord,Show)
134 -- | In normal order: a list of 'Header's, maybe ended by 'Value', all read on the same line.
138 -- | In reverse order: a list of nodes in scope
139 -- (hence to which the next line can append to).
142 -- | Having an initial 'Root' simplifies 'mergeRowIndent':
143 -- one can always put the last 'Root' as a child to a previous one.
144 -- This 'Root' just has to be discarded by 'collapseRows'.
146 initRows = [Tree0 $ Cell (Span "" p p :| []) $ NodeHeader HeaderDash]
147 where p = Pos{pos_line= -1, pos_column=0}
148 -- NOTE: such that any following 'Root'
149 -- is 'NodePara' if possible, and always a child.
151 -- | @mergeRow rows row@ append @row@ into @rows@, while merging what has to be.
153 -- * [@rows@] is old 'Rows', its |Root|s' 'cell_begin' are descending (non-strictly),
154 -- they MAY span over multilines, and they can be many from a single line.
155 -- * [@row@] is new 'Row', its |Root|s' 'cell_begin' are descending (non-strictly),
156 -- they MUST span only over a single and entire line.
158 -- This is the main entry point to build 'Rows' by accumulating 'Row' into them.
159 mergeRow :: Rows -> Row -> Rows
161 debug2_ "mergeRow" ("news",List.reverse row) ("olds",rows) $
162 mergeRowPrefix 0 rows $ List.reverse row
164 -- | Merge by considering matching prefixes.
166 -- 'HeaderGreat' and 'HeaderBar' work, not on indentation,
167 -- but on their vertical alignment as prefixes.
168 -- Hence, each new 'Row' has those prefixes zipped into a single one
169 -- when they match, are aligned and adjacent.
170 mergeRowPrefix :: ColNum -> Rows -> Row -> Rows
171 mergeRowPrefix col rows row =
172 debug3_ "mergeRowPrefix" ("col",col) ("news",row) ("olds",rows) $
175 (_, []) -> undefined -- NOTE: cannot happen with initRows
176 ( _new@(Tree (Cell (Span _fn bn _en:|_sn) n) _ns):news
177 , _old@(Tree (Cell (Span _fo _bo eo:|_so) _o) _os):_olds ) ->
178 case collapseRowsWhile isCollapsable rows of
179 [] -> mergeRowIndent rows row
180 head@(unTree -> ch@(Cell (Span _fh bh _eh:|_sh) h)) : olds' ->
182 -- NOTE: zipping: when new is HeaderGreat, collapse last line downto col
183 -- then check if there is a matching HeaderGreat,
184 -- if so, discard new and restart with a col advanced to new's beginning
185 (NodeHeader HeaderGreat{}, NodeHeader HeaderGreat{})
186 | isAdjacent && isMatching ch -> discard
187 -- NOTE: same for HeaderBar
188 (NodeHeader HeaderBar{}, NodeHeader HeaderBar{})
189 | isAdjacent && isMatching ch -> discard
190 -- NOTE: collapsing: any other new aligned or on the right of an adjacent head
191 -- makes it collapse entirely
192 (_, NodeHeader HeaderGreat{})
193 | col < pos_column bh -> collapse
194 -- NOTE: same for HeaderBar
195 (_, NodeHeader HeaderBar{})
196 | col < pos_column bh -> collapse
197 _ -> debug "mergeRowPrefix/indent" $ mergeRowIndent rows row
199 isAdjacent = pos_line bn - pos_line eo <= 1
200 discard = debug "mergeRowPrefix/discard" $ mergeRowPrefix (pos_column bh) rows news
201 collapse = debug "mergeRowPrefix/collapse" $ mergeRowPrefix col (collapseRoot head olds') row
203 isMatching (Cell (Span _fh bh _eh:|_sh) h) =
204 pos_column bn == pos_column bh &&
206 isCollapsable = debug2 "mergeRowPrefix/isCollapsable" "new" "old" $
207 \_t0@(unTree -> c0@(Cell (Span _f0 b0 _e0:|_s0) _n0))
208 _t1@(unTree -> Cell (Span _f1 b1 e1:|_s1) _n1) ->
209 not (isMatching c0) &&
210 (pos_line b0 - pos_line e1 <= 1) && -- adjacent
211 col < pos_column b1 -- righter than col
213 -- | Merge by considering indentation.
214 mergeRowIndent :: Rows -> Row -> Rows
215 mergeRowIndent rows row =
216 debug2_ "mergeRowIndent" ("news",row) ("olds",rows) $
219 (_, []) -> undefined -- NOTE: cannot happen with initRows
220 ( new@(Tree (Cell ssn@(Span fn bn en:|sn) n) ns):news
221 ,old@(Tree (Cell sso@(Span fo bo eo:|so) o) os):olds ) ->
222 case debug0 "mergeRowIndent/colNew" (pos_column bn) `compare`
223 debug0 "mergeRowIndent/colOld" (pos_column bo) of
224 -- NOTE: new is on the left
227 -- NOTE: merge adjacent NodeText
230 (NodeText tn, NodeText to)
231 | TL.null tn || TL.null to
232 , not isVerbatim -> collapse
233 | isAdjacent && isIndented -> merge $ Tree t (os<>ns)
235 t = NodeText <$> Cell (Span fo boNew eo:|so) (indent<>to) <> Cell ssn tn
236 boNew = bo{pos_column=pos_column bn}
237 indent = TL.replicate (int64 $ pos_column bo - pos_column bn) " "
238 -- | Whether the horizontal delta is made of spaces
240 debug0 "mergeRowIndent/isIndented" $
243 (unTree -> (cell_location -> (span_end -> ep) :| _)) : _ ->
244 case pos_line ep `compare` pos_line bo of
246 EQ -> pos_column ep <= pos_column bn
249 -- NOTE: new is vertically aligned
252 -- NOTE: preserve all NodeText "", but still split into two NodePara
253 (NodeText tn, NodeText to)
254 | TL.null tn || TL.null to
255 , not isVerbatim -> collapse
256 | isAdjacent -> merge $ Tree (NodeText <$> Cell sso to <> Cell ssn tn) (os<>ns)
257 -- NOTE: HeaderSection can parent Nodes at the same level
258 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
259 if debug0 "mergeRowIndent/lvlNew" lvlNew
260 > debug0 "mergeRowIndent/lvlOld" lvlOld
267 -- NOTE: old is no HeaderSection, then collapse to any older and loop
268 (NodeHeader HeaderSection{}, _)
269 | rows'@(sec:_) <- collapseRowsWhile isCollapsable rows
270 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
271 mergeRowIndent rows' row
273 isCollapsable = debug2 "mergeRowIndent/isCollapsable" "new" "old" $
274 \_t0@(unTree -> Cell (Span _f0 b0 _e0:|_ss0) n0) _t1 ->
276 NodeHeader HeaderSection{} -> False
277 _ -> pos_column bn == pos_column b0
278 -- NOTE: in case of alignment, HeaderSection is parent
279 (_, NodeHeader HeaderSection{}) -> concat
282 -- NOTE: new is on the right
285 -- NOTE: keep NodeText "" out of old NodePara
286 (NodeText "", NodePara) -> collapse
287 -- NOTE: merge adjacent NodeText
288 (NodeText tn, NodeText to) ->
290 _ | TL.null tn || TL.null to
291 , not isVerbatim -> collapse
294 True -> merge $ Tree (NodeText <$> Cell sso to <> Cell ssn tn) (os<>ns)
298 False -> mergeRowIndent (collapseRoot old olds) (shifted:news)
300 shifted = Tree (Cell (Span fn bnNew en:|sn) $ NodeText $ indent<>tn) (os<>ns)
301 bnNew = bn{pos_column=pos_column bo}
302 indent = TL.replicate (int64 $ pos_column bn - pos_column bo) " "
306 isAdjacent = pos_line bn - pos_line eo <= 1
307 -- | Whether a parent semantic want new to stay a NodeText
308 isVerbatim = any p rows
310 p (unTree -> (unCell -> NodeHeader HeaderBar{})) = True
312 concat = debug "mergeRowIndent/concat" $ List.reverse row <> rows
313 merge m = debug "mergeRowIndent/merge" $ mergeRowIndent (m : olds) news
314 collapse = debug "mergeRowIndent/collapse" $ mergeRowIndent (collapseRoot old olds) row
315 replace = debug "mergeRowIndent/replace" $ mergeRowIndent (new : collapseRoot old olds) news
317 -- | Like 'mergeRowIndent', but without maintaining the appending,
318 -- hence collapsing all the 'Root's of the given 'Rows'.
320 -- NOTE: 'initRows' MUST have been the first 'Rows'
321 -- before calling 'mergeRowIndent' on it to get the given 'Rows'.
322 collapseRows :: Rows -> Roots
324 debug1_ "collapseRows" ("rows",rows) $
325 case collapseRowsWhile (\_new _old -> True) rows of
328 -- NOTE: subTrees returns the children of the updated initRows
330 -- | Collapse downto any last HeaderSection, returning it and its level.
331 collapseSection :: ColNum -> Rows -> Rows
332 collapseSection col = debug1 "collapseSection" "rows" go
334 go rows@(new@(unTree -> Cell (Span _fn bn _en:|_sn) n):olds)
335 | col <= pos_column bn =
337 NodeHeader HeaderSection{} -> rows
338 _ -> collapseSection col $ collapseRoot new $ go olds
341 collapseRowsWhile :: (Root -> Root -> Bool) -> Rows -> Rows
342 collapseRowsWhile test = debug1 "collapseRowsWhile" "rows" $ \case
344 rows@(new@(Tree (Cell (Span _fn bn _en:|_sn) n) _ns):news) ->
347 old@(Tree (Cell (Span _fo bo eo:|_so) o) _os):olds
348 | not $ test new old -> rows
350 case debug0 "collapseRowsWhile/colNew" (pos_column bn) `compare`
351 debug0 "collapseRowsWhile/colOld" (pos_column bo) of
352 -- NOTE: new is vertically aligned
355 -- NOTE: HeaderSection can parent Nodes at the same level
356 (NodeHeader (HeaderSection lvlNew), NodeHeader (HeaderSection lvlOld)) ->
357 if debug0 "collapseRowsWhile/lvlNew" lvlNew
358 > debug0 "collapseRowsWhile/lvlOld" lvlOld
365 debug "collapseRowsWhile/replace" $
366 collapseRowsWhile test $ (new:) $ collapseRoot old olds
367 -- NOTE: old is no HeaderSection, then collapse to any older and loop
368 (NodeHeader HeaderSection{}, _)
369 | news'@(sec:_) <- debug0 "collapseRowsWhile/section" $ collapseRowsWhile isCollapsable news
370 , (unTree -> (unCell -> NodeHeader HeaderSection{})) <- sec ->
371 collapseRowsWhile test news'
373 isCollapsable = debug2 "collapseRowsWhile/isCollapsable" "new" "old" $
374 \_t0@(unTree -> Cell (Span _f0 b0 _e0:|_s0) n0) _t1 ->
376 NodeHeader HeaderSection{} -> False
377 _ -> pos_column bn == pos_column b0
378 -- NOTE: in case of alignment, HeaderSection is parent
379 (_, NodeHeader HeaderSection{}) -> debug "collapseRowsWhile/section/parent" collapse
380 -- NOTE: merge within old NodePara.
381 (_, NodePara) | isAdjacent -> collapse
384 -- NOTE: new is either on the left or on the right
387 isAdjacent = pos_line bn - pos_line eo <= 1
388 collapse = debug "collapseRowsWhile/collapse" $ collapseRowsWhile test $ collapseRoot new $ news
389 collapse2 = debug "collapseRowsWhile/collapse2" $ collapseRowsWhile test $ collapseRoot new $ collapseRoot old $ olds
391 -- | Put a 'Root' as a child of the head 'Root'.
393 -- NOTE: 'collapseRoot' is where 'NodePara' may be introduced.
395 -- NOTE: any NodeText/NodeText merging must have been done before.
396 collapseRoot :: Root -> Rows -> Rows
397 collapseRoot new@(Tree (Cell ssn@(Span _fn bn en:|_sn) n) _ns) rows =
398 debug2_ "collapseRoot" ("new",Seq.singleton new) ("rows",rows) $
401 old@(Tree (Cell (Span fo bo eo:|so) o) os) : olds ->
403 -- NOTE: no child into NodeText
404 (_, NodeText{}) -> collapse2
405 -- NOTE: NodeText can begin a NodePara
406 (NodeText tn, _) | not $ TL.null tn ->
408 -- NOTE: no NodePara within those
409 NodeHeader HeaderEqual{} -> collapse
410 NodeHeader HeaderBar{} -> collapse
411 NodeHeader HeaderDashDash{} -> collapse
412 -- NOTE: NodePara within those
413 NodePara | not isAdjacent -> para
416 -- NOTE: amongst remaining nodes, only adjacent ones may enter an old NodePara.
417 -- Note that since a NodePara is never adjacent to another,
418 -- it is not nested within another.
422 -- NOTE: no HeaderSection (even adjacent) within a NodePara
423 NodeHeader HeaderSection{} -> collapse2
425 | otherwise -> collapse2
428 isAdjacent = pos_line bn - pos_line eo <= 1
429 para = Tree (Cell ssn NodePara) (return new) : rows
430 collapse = Tree (Cell (Span fo bo en:|so) o) (os |> new) : olds
431 collapse2 = collapseRoot new $ collapseRoot old olds