{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} module Textphile.DTC.Analyze.Index where import Control.Applicative (Applicative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Default.Class (Default(..)) import Data.Foldable (Foldable(..), concat) import Data.Function (($), (.), const) import Data.Functor ((<$>), (<$)) import Data.Functor.Compose (Compose(..)) import Data.Map.Strict (Map) import Data.Maybe (Maybe(..), maybe) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence ((|>)) import Data.Traversable (Traversable(..)) import Data.TreeMap.Strict (TreeMap(..)) import Data.TreeSeq.Strict (Tree(..), tree0) import qualified Data.HashMap.Strict as HM -- import qualified Control.Monad.Trans.RWS.Strict as RWS import qualified Control.Monad.Trans.Reader as R import qualified Control.Monad.Trans.State as S import qualified Data.Char as Char import qualified Data.List as List import qualified Data.Map.Strict as Map import qualified Data.Sequence as Seq import qualified Data.Strict.Maybe as Strict import qualified Data.Text.Lazy as TL import qualified Data.TreeMap.Strict as TM import qualified Data.TreeSeq.Strict as TS import Textphile.DTC.Document as DTC import qualified Textphile.XML as XML {- -- * Type 'Index' data Index = Index { index_terms :: TM.TreeMap Word (Seq (Either Head Section)) , index_tag :: HS.HashSet Ident , index_at :: HS.HashSet Ident } deriving (Eq,Show) instance Default Index where def = Index { index_terms = def , index_tag = def , index_at = def } -} {- data Indexable = Indexable { indexable_terms :: Terms , indexable_tags :: HS.HashSet Ident } -} -- | Build an 'Index' a whole 'Document', returning a mangled version of it -- where 'PlainIref's are inserted as required by the given 'Terms'. indexifyDocument :: Terms -> Document -> (Document, Index) indexifyDocument terms doc = let index = indexOfTerms terms in if null terms then (doc, def) else (`S.runState` index) $ (`R.runReader` def) $ getCompose $ indexify doc indexOfTerms :: Terms -> Index indexOfTerms = TM.fromList const . (>>= f) . concat where f [] = [] f ws = maybe [] (\p -> [(p, Seq.empty)]) $ pathFromWords ws -- * Type 'Reader' newtype Reader = Reader { reader_section :: Section } instance Default Reader where def = Reader { reader_section = def } -- * Type 'State' type State = Index -- * Class 'Indexify' class Indexify a where indexify :: a -> Compose (R.Reader Reader) (S.State State) a instance Indexify Document where indexify Document{..} = Document <$> traverse indexify document_head <*> traverse indexify document_body instance Indexify Head where indexify Head{..} = Head <$> indexify head_section <*> traverse indexify head_body instance Indexify (Tree BodyNode) where indexify (Tree n ts) = case n of BodyBlock b -> Tree . BodyBlock <$> indexify b <*> traverse indexify ts BodySection section@Section{..} -> Compose $ R.local (\ro -> ro{reader_section = section}) $ getCompose $ Tree . BodySection <$> indexify section <*> traverse indexify ts instance Indexify Section where indexify Section{..} = Section section_posXML section_locTCT section_attrs <$> indexify section_about instance Indexify Block where indexify b = case b of BlockPara p -> BlockPara <$> indexify p BlockBreak{} -> pure b BlockToC{} -> pure b BlockToF{} -> pure b BlockAside{..} -> BlockAside posXML attrs <$> traverse indexify blocks BlockIndex{..} -> pure b BlockFigure{..} -> BlockFigure posXML type_ attrs <$> traverse indexify mayTitle <*> traverse indexify paras BlockReferences{..} -> BlockReferences posXML attrs <$> traverse indexify refs BlockJudges js -> BlockJudges <$> indexify js BlockGrades{..} -> BlockGrades posXML attrs <$> indexify scale instance Indexify Para where indexify = \case ParaItem{..} -> ParaItem <$> indexify item ParaItems{..} -> ParaItems posXML attrs <$> traverse indexify items instance Indexify ParaItem where indexify = \case ParaPlain plain -> ParaPlain <$> indexify plain ParaOL items -> ParaOL <$> traverse indexify items ParaUL items -> ParaUL <$> traverse (traverse indexify) items ParaQuote{..} -> ParaQuote type_ <$> traverse indexify paras p@ParaArtwork{} -> pure p p@ParaComment{} -> pure p ParaJudgment j -> ParaJudgment <$> indexify j instance Indexify ListItem where indexify ListItem{..} = ListItem name <$> traverse indexify paras instance Indexify Plain where indexify = traverse indexify instance Indexify (Tree PlainNode) where indexify (Tree n ts) = case n of PlainIref{..} -> case pathFromWords iref_term of Nothing -> Tree n <$> traverse indexify ts Just words -> Compose $ R.ask >>= \ro -> getCompose $ Tree n <$ Compose (pure (S.modify' $ \state_irefsByWords -> TM.insert (<>) words (pure $ reader_section ro) $ state_irefsByWords )) <*> traverse indexify ts PlainText txt -> -- TODO: introduce PlainGroup only when necessary? Tree PlainGroup <$> indexifyWords (wordify txt) _ -> Tree n <$> traverse indexify ts instance Indexify Title where indexify (Title p) = Title <$> indexify p instance Indexify About where indexify About{..} = About <$> traverse indexify about_titles <*> pure about_aliases <*> traverse indexify about_authors <*> traverse indexify about_dates <*> pure about_tags <*> pure about_links <*> pure about_series <*> traverse indexify about_description <*> traverse indexify about_judgments instance Indexify Entity where indexify = pure -- TODO: to be coded instance Indexify Date where indexify = pure -- TODO: to be coded instance Indexify Include where indexify = pure -- TODO: to be coded instance Indexify Reference where indexify Reference{..} = do Reference reference_posXML reference_locTCT reference_id <$> indexify reference_about instance Indexify Judges where indexify Judges{..} = Judges judges_locTCT judges_posXML judges_attrs <$> traverse (traverse indexify) judges_byName instance Indexify [Grade] where indexify = traverse indexify instance Indexify Judgment where indexify Judgment{..} = Judgment judgment_opinionsByChoice -- judgment_judges -- judgment_grades judgment_posXML judgment_locTCT judgment_judgesId judgment_gradesId judgment_importance judgment_hide <$> traverse indexify judgment_question <*> traverse indexify judgment_choices instance Indexify Choice where indexify Choice{..} = Choice choice_locTCT choice_posXML <$> traverse indexify choice_title <*> traverse indexify choice_opinions instance Indexify Opinion where indexify Opinion{..} = Opinion opinion_locTCT opinion_posXML opinion_judge opinion_grade opinion_default opinion_importance <$> traverse indexify opinion_comment instance Indexify Grade where indexify Grade{..} = Grade grade_posXML grade_name grade_color grade_isDefault <$> traverse indexify grade_title instance Indexify Judge where indexify Judge{..} = Judge judge_locTCT judge_posXML judge_name <$> traverse indexify judge_title <*> pure judge_defaultGrades -- * Type 'Words' wordify :: TL.Text -> Words wordify = List.reverse . go [] where go :: Words -> TL.Text -> Words go acc t = case TL.span Char.isAlphaNum t of ("",_) -> case TL.span Char.isSpace t of ("",_) -> case TL.uncons t of Nothing -> acc Just (c,r) -> go (Word (TL.singleton c) : acc) r (_s,r) -> go (Space : acc) r (w,r) -> go (Word w : acc) r plainifyWord :: WordOrSpace -> TL.Text plainifyWord = \case Word w -> w Space -> " " plainifyWords :: Words -> TL.Text plainifyWords = TL.concat . (plainifyWord <$>) indexifyWords :: Words -> Compose (R.Reader Reader) (S.State Index) Plain indexifyWords ws = Compose $ R.ask >>= \case ro -> pure $ go mempty ws where go :: Plain -> Words -> S.State Index Plain go acc inp = case inp of [] -> return acc Space : next -> go (acc |> tree0 (PlainText " ")) next Word w : next -> goWords [] inp >>= \case Nothing -> go (acc |> tree0 (PlainText w)) next Just (ls, ns) -> do let iref_term = List.reverse ls let lines = Seq.fromList $ tree0 . PlainText . plainifyWord <$> iref_term go (acc |> Tree PlainIref{iref_term} lines) ns goWords :: Words -> Words -> S.State Index (Maybe (Words, Words)) goWords prev inp = do TM.TreeMap irefsByWord <- S.get case inp of [] -> return Nothing curr@Space : next -> goWords (curr:prev) next curr@(Word w) : next -> case Map.lookup w irefsByWord of Nothing -> return Nothing Just nod@TM.Node{..} -> do let prev' = curr:prev S.put node_descendants case node_value of Strict.Nothing | null node_descendants -> return Nothing | otherwise -> goWords prev' next <* S.modify' (\rs -> TreeMap $ Map.insert w nod{TM.node_descendants = rs} irefsByWord) Strict.Just irefs -> goWords prev' next >>= \case Nothing -> do S.put $ TM.TreeMap $ Map.insert w nod{TM.node_value = Strict.Just $ reader_section ro Seq.<|irefs} irefsByWord return $ Just (prev', next) r@Just{} -> do S.modify' $ \rs -> TM.TreeMap $ Map.insert w nod{TM.node_descendants = rs} irefsByWord return r termsByChar :: Terms -> Map Char Terms termsByChar = foldr (\aliases acc -> case aliases of (Word w:_):_ | not (TL.null w) -> Map.insertWith (<>) (Char.toUpper $ TL.index w 0) [aliases] acc _ -> acc ) Map.empty -- * Type 'IndexByPos' type IndexByPos = HM.HashMap XML.Pos Terms -- ** Class 'CollectIndex' class CollectIndex a where collectIndex :: a -> IndexByPos instance CollectIndex Document where collectIndex Document{..} = collectIndex document_body instance CollectIndex (TS.Trees BodyNode) where collectIndex = foldMap $ \(TS.Tree b bs) -> case b of BodyBlock blk -> collectIndex blk BodySection _ -> collectIndex bs instance CollectIndex Block where collectIndex = \case BlockIndex{..} -> HM.singleton posXML index _ -> def