]> Git — Sourcephile - doclang.git/blob - src/Textphile/DTC/Analyze/Index.hs
Rename {hdoc => textphile}
[doclang.git] / src / Textphile / DTC / Analyze / Index.hs
1 {-# LANGUAGE FlexibleInstances #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 module Textphile.DTC.Analyze.Index where
4
5 import Control.Applicative (Applicative(..))
6 import Control.Monad (Monad(..))
7 import Data.Bool
8 import Data.Char (Char)
9 import Data.Default.Class (Default(..))
10 import Data.Foldable (Foldable(..), concat)
11 import Data.Function (($), (.), const)
12 import Data.Functor ((<$>), (<$))
13 import Data.Functor.Compose (Compose(..))
14 import Data.Map.Strict (Map)
15 import Data.Maybe (Maybe(..), maybe)
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence ((|>))
19 import Data.Traversable (Traversable(..))
20 import Data.TreeMap.Strict (TreeMap(..))
21 import Data.TreeSeq.Strict (Tree(..), tree0)
22 import qualified Data.HashMap.Strict as HM
23 -- import qualified Control.Monad.Trans.RWS.Strict as RWS
24 import qualified Control.Monad.Trans.Reader as R
25 import qualified Control.Monad.Trans.State as S
26 import qualified Data.Char as Char
27 import qualified Data.List as List
28 import qualified Data.Map.Strict as Map
29 import qualified Data.Sequence as Seq
30 import qualified Data.Strict.Maybe as Strict
31 import qualified Data.Text.Lazy as TL
32 import qualified Data.TreeMap.Strict as TM
33 import qualified Data.TreeSeq.Strict as TS
34
35 import Textphile.DTC.Document as DTC
36 import qualified Textphile.XML as XML
37
38 {-
39 -- * Type 'Index'
40 data Index = Index
41 { index_terms :: TM.TreeMap Word (Seq (Either Head Section))
42 , index_tag :: HS.HashSet Ident
43 , index_at :: HS.HashSet Ident
44 } deriving (Eq,Show)
45 instance Default Index where
46 def = Index
47 { index_terms = def
48 , index_tag = def
49 , index_at = def
50 }
51 -}
52
53 {-
54 data Indexable = Indexable
55 { indexable_terms :: Terms
56 , indexable_tags :: HS.HashSet Ident
57 }
58 -}
59
60 -- | Build an 'Index' a whole 'Document', returning a mangled version of it
61 -- where 'PlainIref's are inserted as required by the given 'Terms'.
62 indexifyDocument :: Terms -> Document -> (Document, Index)
63 indexifyDocument terms doc =
64 let index = indexOfTerms terms in
65 if null terms then (doc, def)
66 else
67 (`S.runState` index) $
68 (`R.runReader` def) $
69 getCompose $
70 indexify doc
71
72 indexOfTerms :: Terms -> Index
73 indexOfTerms = TM.fromList const . (>>= f) . concat
74 where
75 f [] = []
76 f ws = maybe [] (\p -> [(p, Seq.empty)]) $ pathFromWords ws
77
78 -- * Type 'Reader'
79 newtype Reader = Reader
80 { reader_section :: Section
81 }
82 instance Default Reader where
83 def = Reader
84 { reader_section = def
85 }
86
87 -- * Type 'State'
88 type State = Index
89
90 -- * Class 'Indexify'
91 class Indexify a where
92 indexify :: a -> Compose (R.Reader Reader) (S.State State) a
93 instance Indexify Document where
94 indexify Document{..} =
95 Document
96 <$> traverse indexify document_head
97 <*> traverse indexify document_body
98 instance Indexify Head where
99 indexify Head{..} =
100 Head
101 <$> indexify head_section
102 <*> traverse indexify head_body
103 instance Indexify (Tree BodyNode) where
104 indexify (Tree n ts) =
105 case n of
106 BodyBlock b ->
107 Tree . BodyBlock
108 <$> indexify b
109 <*> traverse indexify ts
110 BodySection section@Section{..} ->
111 Compose $ R.local (\ro -> ro{reader_section = section}) $
112 getCompose $
113 Tree . BodySection
114 <$> indexify section
115 <*> traverse indexify ts
116 instance Indexify Section where
117 indexify Section{..} =
118 Section section_posXML section_locTCT section_attrs
119 <$> indexify section_about
120 instance Indexify Block where
121 indexify b = case b of
122 BlockPara p -> BlockPara <$> indexify p
123 BlockBreak{} -> pure b
124 BlockToC{} -> pure b
125 BlockToF{} -> pure b
126 BlockAside{..} -> BlockAside posXML attrs <$> traverse indexify blocks
127 BlockIndex{..} -> pure b
128 BlockFigure{..} ->
129 BlockFigure posXML type_ attrs
130 <$> traverse indexify mayTitle
131 <*> traverse indexify paras
132 BlockReferences{..} ->
133 BlockReferences posXML attrs
134 <$> traverse indexify refs
135 BlockJudges js -> BlockJudges <$> indexify js
136 BlockGrades{..} ->
137 BlockGrades posXML attrs
138 <$> indexify scale
139 instance Indexify Para where
140 indexify = \case
141 ParaItem{..} -> ParaItem <$> indexify item
142 ParaItems{..} -> ParaItems posXML attrs <$> traverse indexify items
143 instance Indexify ParaItem where
144 indexify = \case
145 ParaPlain plain -> ParaPlain <$> indexify plain
146 ParaOL items -> ParaOL <$> traverse indexify items
147 ParaUL items -> ParaUL <$> traverse (traverse indexify) items
148 ParaQuote{..} -> ParaQuote type_ <$> traverse indexify paras
149 p@ParaArtwork{} -> pure p
150 p@ParaComment{} -> pure p
151 ParaJudgment j -> ParaJudgment <$> indexify j
152 instance Indexify ListItem where
153 indexify ListItem{..} = ListItem name <$> traverse indexify paras
154 instance Indexify Plain where
155 indexify = traverse indexify
156 instance Indexify (Tree PlainNode) where
157 indexify (Tree n ts) =
158 case n of
159 PlainIref{..} ->
160 case pathFromWords iref_term of
161 Nothing -> Tree n <$> traverse indexify ts
162 Just words ->
163 Compose $ R.ask >>= \ro -> getCompose $
164 Tree n
165 <$ Compose (pure (S.modify' $ \state_irefsByWords ->
166 TM.insert (<>) words (pure $ reader_section ro) $
167 state_irefsByWords
168 ))
169 <*> traverse indexify ts
170 PlainText txt ->
171 -- TODO: introduce PlainGroup only when necessary?
172 Tree PlainGroup <$> indexifyWords (wordify txt)
173 _ -> Tree n <$> traverse indexify ts
174 instance Indexify Title where
175 indexify (Title p) = Title <$> indexify p
176 instance Indexify About where
177 indexify About{..} =
178 About
179 <$> traverse indexify about_titles
180 <*> pure about_aliases
181 <*> traverse indexify about_authors
182 <*> traverse indexify about_dates
183 <*> pure about_tags
184 <*> pure about_links
185 <*> pure about_series
186 <*> traverse indexify about_description
187 <*> traverse indexify about_judgments
188 instance Indexify Entity where
189 indexify = pure -- TODO: to be coded
190 instance Indexify Date where
191 indexify = pure -- TODO: to be coded
192 instance Indexify Include where
193 indexify = pure -- TODO: to be coded
194 instance Indexify Reference where
195 indexify Reference{..} = do
196 Reference
197 reference_posXML
198 reference_locTCT
199 reference_id
200 <$> indexify reference_about
201 instance Indexify Judges where
202 indexify Judges{..} =
203 Judges
204 judges_locTCT
205 judges_posXML
206 judges_attrs
207 <$> traverse (traverse indexify) judges_byName
208 instance Indexify [Grade] where
209 indexify = traverse indexify
210 instance Indexify Judgment where
211 indexify Judgment{..} =
212 Judgment
213 judgment_opinionsByChoice
214 -- judgment_judges
215 -- judgment_grades
216 judgment_posXML
217 judgment_locTCT
218 judgment_judgesId
219 judgment_gradesId
220 judgment_importance
221 judgment_hide
222 <$> traverse indexify judgment_question
223 <*> traverse indexify judgment_choices
224 instance Indexify Choice where
225 indexify Choice{..} =
226 Choice choice_locTCT choice_posXML
227 <$> traverse indexify choice_title
228 <*> traverse indexify choice_opinions
229 instance Indexify Opinion where
230 indexify Opinion{..} =
231 Opinion
232 opinion_locTCT
233 opinion_posXML
234 opinion_judge
235 opinion_grade
236 opinion_default
237 opinion_importance
238 <$> traverse indexify opinion_comment
239 instance Indexify Grade where
240 indexify Grade{..} =
241 Grade
242 grade_posXML
243 grade_name
244 grade_color
245 grade_isDefault
246 <$> traverse indexify grade_title
247 instance Indexify Judge where
248 indexify Judge{..} =
249 Judge
250 judge_locTCT
251 judge_posXML
252 judge_name
253 <$> traverse indexify judge_title
254 <*> pure judge_defaultGrades
255
256 -- * Type 'Words'
257 wordify :: TL.Text -> Words
258 wordify = List.reverse . go []
259 where
260 go :: Words -> TL.Text -> Words
261 go acc t =
262 case TL.span Char.isAlphaNum t of
263 ("",_) ->
264 case TL.span Char.isSpace t of
265 ("",_) ->
266 case TL.uncons t of
267 Nothing -> acc
268 Just (c,r) -> go (Word (TL.singleton c) : acc) r
269 (_s,r) -> go (Space : acc) r
270 (w,r) -> go (Word w : acc) r
271
272 plainifyWord :: WordOrSpace -> TL.Text
273 plainifyWord = \case
274 Word w -> w
275 Space -> " "
276
277 plainifyWords :: Words -> TL.Text
278 plainifyWords = TL.concat . (plainifyWord <$>)
279
280 indexifyWords :: Words -> Compose (R.Reader Reader) (S.State Index) Plain
281 indexifyWords ws =
282 Compose $ R.ask >>= \case
283 ro -> pure $ go mempty ws
284 where
285 go :: Plain -> Words -> S.State Index Plain
286 go acc inp =
287 case inp of
288 [] -> return acc
289 Space : next ->
290 go (acc |> tree0 (PlainText " ")) next
291 Word w : next ->
292 goWords [] inp >>= \case
293 Nothing -> go (acc |> tree0 (PlainText w)) next
294 Just (ls, ns) -> do
295 let iref_term = List.reverse ls
296 let lines = Seq.fromList $ tree0 . PlainText . plainifyWord <$> iref_term
297 go (acc |> Tree PlainIref{iref_term} lines) ns
298 goWords ::
299 Words -> Words ->
300 S.State Index (Maybe (Words, Words))
301 goWords prev inp = do
302 TM.TreeMap irefsByWord <- S.get
303 case inp of
304 [] -> return Nothing
305 curr@Space : next -> goWords (curr:prev) next
306 curr@(Word w) : next ->
307 case Map.lookup w irefsByWord of
308 Nothing -> return Nothing
309 Just nod@TM.Node{..} -> do
310 let prev' = curr:prev
311 S.put node_descendants
312 case node_value of
313 Strict.Nothing
314 | null node_descendants -> return Nothing
315 | otherwise -> goWords prev' next
316 <* S.modify' (\rs -> TreeMap $ Map.insert w nod{TM.node_descendants = rs} irefsByWord)
317 Strict.Just irefs ->
318 goWords prev' next >>= \case
319 Nothing -> do
320 S.put $ TM.TreeMap $
321 Map.insert w
322 nod{TM.node_value = Strict.Just $ reader_section ro Seq.<|irefs}
323 irefsByWord
324 return $ Just (prev', next)
325 r@Just{} -> do
326 S.modify' $ \rs ->
327 TM.TreeMap $ Map.insert w nod{TM.node_descendants = rs} irefsByWord
328 return r
329
330 termsByChar :: Terms -> Map Char Terms
331 termsByChar =
332 foldr (\aliases acc ->
333 case aliases of
334 (Word w:_):_ | not (TL.null w) ->
335 Map.insertWith (<>)
336 (Char.toUpper $ TL.index w 0)
337 [aliases] acc
338 _ -> acc
339 ) Map.empty
340
341 -- * Type 'IndexByPos'
342 type IndexByPos = HM.HashMap XML.Pos Terms
343
344 -- ** Class 'CollectIndex'
345 class CollectIndex a where
346 collectIndex :: a -> IndexByPos
347 instance CollectIndex Document where
348 collectIndex Document{..} =
349 collectIndex document_body
350 instance CollectIndex (TS.Trees BodyNode) where
351 collectIndex = foldMap $ \(TS.Tree b bs) ->
352 case b of
353 BodyBlock blk -> collectIndex blk
354 BodySection _ -> collectIndex bs
355 instance CollectIndex Block where
356 collectIndex = \case
357 BlockIndex{..} -> HM.singleton posXML index
358 _ -> def