]> Git — Sourcephile - doclang.git/blob - Language/DTC/Index.hs
Add multi-words indexing.
[doclang.git] / Language / DTC / Index.hs
1 {-# LANGUAGE DuplicateRecordFields #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE FlexibleContexts #-}
4 {-# LANGUAGE MultiParamTypeClasses #-}
5 {-# LANGUAGE OverloadedStrings #-}
6 {-# LANGUAGE ScopedTypeVariables #-}
7 {-# LANGUAGE TypeApplications #-}
8 {-# LANGUAGE ViewPatterns #-}
9 {-# OPTIONS_GHC -fno-warn-orphans #-}
10 -- | Compute an Index for a DTC.
11 module Language.DTC.Index where
12
13 import Control.Applicative (Applicative(..))
14 import Control.Monad (Monad(..), mapM, forM)
15 import Data.Bool
16 import Data.Char (Char)
17 import Data.Default.Class (Default(..))
18 import Data.Foldable (Foldable(..), concat)
19 import Data.Function (($), (.), const)
20 import Data.Functor ((<$>))
21 import Data.Map.Strict (Map)
22 import Data.Maybe (Maybe(..), maybe)
23 import Data.Monoid (Monoid(..))
24 import Data.Semigroup (Semigroup(..))
25 import Data.Sequence (Seq)
26 import Data.Text (Text)
27 import Data.TreeMap.Strict (TreeMap(..))
28 import Data.TreeSeq.Strict (Tree(..))
29 import Prelude (Num(..))
30 import Text.Show (Show(..))
31 import qualified Control.Monad.Trans.State as S
32 import qualified Data.Char as Char
33 import qualified Data.List as List
34 import qualified Data.Map.Strict as Map
35 import qualified Data.Strict.Maybe as Strict
36 import qualified Data.Text as Text
37 import qualified Data.TreeMap.Strict as TreeMap
38
39 import Language.DTC.Document (Count,Words,Terms, Word, WordOrSpace(..), Words)
40 import Language.XML (XmlPos(..))
41 import qualified Language.DTC.Document as DTC
42
43 -- import Debug.Trace (trace)
44
45 termsByChar :: Terms -> Map Char Terms
46 termsByChar =
47 foldr (\aliases acc ->
48 case aliases of
49 (Word w:_):_ | not (Text.null w) ->
50 Map.insertWith (<>)
51 (Char.toUpper $ Text.index w 0)
52 [aliases] acc
53 _ -> acc
54 ) Map.empty
55
56 -- * Type 'Ref'
57 data Ref
58 = Ref
59 { term :: Words
60 , count :: Count
61 , section :: XmlPos
62 } deriving (Show)
63
64 -- ** Type 'Path'
65 type Path = TreeMap.Path Word
66
67 -- ** Type 'Refs'
68 type Refs = TreeMap Word [Ref]
69
70 refsOfTerms :: Terms -> Refs
71 refsOfTerms = TreeMap.fromList const . (>>= f) . concat
72 where
73 f [] = []
74 f ws = maybe [] (\p -> [(p,[])]) $ pathFromWords ws
75
76 pathFromWords :: Words -> Maybe Path
77 pathFromWords ws =
78 case ws >>= unSpace of
79 p:ps | not (Text.null p) -> Just (TreeMap.path p ps)
80 _ -> Nothing
81 where
82 unSpace = \case
83 Space -> []
84 Word w -> [w]
85
86 -- * Type 'State'
87 data State
88 = State
89 { state_refs :: Refs
90 , state_text :: Bool
91 , state_section :: XmlPos
92 }
93 state :: State
94 state = State
95 { state_refs = mempty
96 , state_text = True
97 , state_section = def
98 }
99
100 -- * Class 'Indexify'
101 class Indexify a where
102 indexify :: a -> S.State State a
103 instance Indexify (Tree k a) => Indexify [Tree k a] where
104 indexify = mapM indexify
105 instance Indexify a => Indexify (Tree DTC.BodyKey a) where
106 indexify = \case
107 Tree0 v -> Tree0 <$> indexify v
108 TreeN k v ->
109 case k of
110 DTC.Section{..} -> do
111 before@State{state_section} <- S.get
112 S.put before{state_section = pos}
113 t <- TreeN <$> indexify k <*> indexify v
114 after <- S.get
115 S.put after{state_section}
116 return t
117 instance Indexify a => Indexify (Seq a) where
118 indexify = mapM indexify
119 instance Indexify DTC.BodyKey where
120 indexify = \case
121 DTC.Section{..} ->
122 DTC.Section pos attrs
123 <$> indexify title
124 <*> pure aliases
125 instance Indexify DTC.BodyValue where
126 indexify = \case
127 d@DTC.ToC{} -> pure d
128 d@DTC.ToF{} -> pure d
129 d@DTC.Index{} -> pure d
130 DTC.Figure{..} ->
131 DTC.Figure pos attrs type_
132 <$> indexify title
133 <*> indexify verts
134 DTC.Vertical v ->
135 DTC.Vertical <$> indexify v
136 instance Indexify [DTC.Reference] where
137 indexify = mapM indexify
138 instance Indexify [DTC.Vertical] where
139 indexify = mapM indexify
140 instance Indexify [[DTC.Vertical]] where
141 indexify = mapM (mapM indexify)
142 instance Indexify DTC.Title where
143 indexify (DTC.Title t) = DTC.Title <$> indexify t
144 instance Indexify DTC.Vertical where
145 indexify = \case
146 DTC.Para{..} -> DTC.Para pos attrs <$> indexify horis
147 DTC.OL{..} -> DTC.OL pos attrs <$> indexify items
148 DTC.UL{..} -> DTC.UL pos attrs <$> indexify items
149 DTC.RL{..} -> DTC.RL pos attrs <$> indexify refs
150 DTC.Artwork{..} -> DTC.Artwork pos attrs <$> indexify art
151 d@DTC.Comment{} -> pure d
152 instance Indexify [DTC.Horizontal] where
153 indexify hs =
154 (concat <$>) $ forM hs $ \case
155 d@DTC.BR -> return [d]
156 DTC.B s -> pure . DTC.B <$> indexify s
157 DTC.Code s -> pure . DTC.Code <$> indexify s
158 DTC.Del s -> pure . DTC.Del <$> indexify s
159 DTC.I s -> pure . DTC.I <$> indexify s
160 DTC.Note s -> pure . DTC.Note <$> indexify s
161 DTC.Q s -> pure . DTC.Q <$> indexify s
162 DTC.SC s -> pure . DTC.SC <$> indexify s
163 DTC.Sub s -> pure . DTC.Sub <$> indexify s
164 DTC.Sup s -> pure . DTC.Sup <$> indexify s
165 DTC.U s -> pure . DTC.U <$> indexify s
166 DTC.Eref{..} -> pure . DTC.Eref href <$> indexify text
167 DTC.Iref{..} -> pure . DTC.Iref count term <$> indexify text
168 DTC.Ref{..} -> pure . DTC.Ref to <$> indexify text
169 DTC.Rref{..} -> pure . DTC.Rref to <$> indexify text
170 DTC.Plain p -> do
171 State{..} <- S.get
172 let (refs,ret) = indexifyWords state_section state_refs (wordify p)
173 S.modify $ \s -> s{state_refs=refs}
174 return ret
175 instance Indexify DTC.Reference where
176 indexify = return
177 instance Indexify DTC.Artwork where
178 indexify = return
179
180 wordify :: Text -> Words
181 wordify = List.reverse . go []
182 where
183 go :: Words -> Text -> Words
184 go acc t =
185 case Text.span Char.isAlphaNum t of
186 ("",_) ->
187 case Text.span Char.isSpace t of
188 ("",_) ->
189 case Text.uncons t of
190 Nothing -> acc
191 Just (c,r) -> go (Word (Text.singleton c) : acc) r
192 (_s,r) -> go (Space : acc) r
193 (w,r) -> go (Word w : acc) r
194
195 plainifyWord :: WordOrSpace -> Text
196 plainifyWord = \case
197 Word w -> w
198 Space -> " "
199
200 plainifyWords :: Words -> Text
201 plainifyWords = Text.concat . (plainifyWord <$>)
202
203 indexifyWords :: XmlPos -> Refs -> Words -> (Refs, [DTC.Horizontal])
204 indexifyWords section = go []
205 where
206 go :: [DTC.Horizontal] -> Refs -> Words -> (Refs, [DTC.Horizontal])
207 go acc refs inp =
208 case inp of
209 [] -> (refs, List.reverse acc)
210 Space : next ->
211 go (DTC.Plain " " : acc) refs next
212 Word w : next ->
213 case goWords [] refs [] inp of
214 Nothing -> go (DTC.Plain w : acc) refs next
215 Just (Ref{term,count}, ls, ns, rs) ->
216 go (DTC.Iref {term, count, text = DTC.Plain . plainifyWord <$> List.reverse ls} : acc) rs ns
217 goWords ::
218 Words -> Refs ->
219 Words -> Words ->
220 Maybe (Ref, Words, Words, Refs)
221 goWords ws m@(TreeMap refsByWord) prev inp =
222 case inp of
223 [] -> Nothing
224 curr@Space : next -> goWords (curr:ws) m (curr:prev) next
225 curr@(Word w) : next ->
226 let words = curr:ws in
227 case Map.lookup w refsByWord of
228 Nothing -> Nothing
229 Just nod@TreeMap.Node{..} ->
230 case node_value of
231 Strict.Nothing ->
232 if null node_descendants
233 then Nothing
234 else case goWords words node_descendants (curr:prev) next of
235 Nothing -> Nothing
236 Just (ref, ls, ns, rs) ->
237 Just (ref, ls, ns, TreeMap $ Map.insert w nod{TreeMap.node_descendants = rs} refsByWord)
238 Strict.Just refs ->
239 case goWords words node_descendants (curr:prev) next of
240 Nothing ->
241 let term = List.reverse words in
242 let count = case refs of [] -> 1; Ref{count=c}:_ -> c + 1 in
243 let ref = Ref{term, count, section} in
244 Just (ref, curr:prev, next, TreeMap $ Map.insert w nod{TreeMap.node_value = Strict.Just $ ref:refs} refsByWord)
245 Just (ref, ls, ns, rs) ->
246 Just (ref, ls, ns, TreeMap $ Map.insert w nod{TreeMap.node_descendants = rs} refsByWord)