{-# OPTIONS_GHC -Wno-orphans #-}

-- | Data types used to index documents: n-grams, roots (groups of
-- n-grams), documents carrying a position, and ranges of positions.
module Phylomemy.Indexation where

import Data.Eq (Eq)
import Data.Map.Strict qualified as Map
import Data.Monoid (Monoid (..))
import Data.Ord (Ord)
import Data.Semigroup (Semigroup (..))
import Data.Sequence qualified as Seq
import Data.Set qualified as Set
import Data.Text.Short (ShortText)
import Data.Validity (Validity (..), declare, delve, trivialValidation)
import Data.Validity.Map ()
import Data.Validity.Set ()
import Data.Validity.Time ()
import GHC.Generics (Generic)
import Text.Show (Show)

-- | A contiguous sequence of n terms
newtype Ngram = Ngram ShortText
  deriving (Eq, Generic, Ord)
  deriving stock (Show)

-- | Every 'Ngram' is accepted: no constraint is checked on the wrapped text.
instance Validity Ngram where
  validate = trivialValidation

-- | A 'Root' is a set of `Ngram`s conveying the same meaning
-- (according to the analyst).
data Root = Root
  { rootLabel :: Ngram
  , rootSynonyms :: Set.Set Ngram
  }
  deriving (Eq, Generic, Ord, Show)

-- | A 'Root' is valid when both of its fields are valid and its
-- 'rootLabel' does not also appear among its 'rootSynonyms'.
instance Validity Root where
  validate Root{rootLabel = label, rootSynonyms = synonyms} =
    mconcat
      [ delve "rootLabel" label
      , -- The label is stored apart from the synonyms, never duplicated in them.
        declare
          "The rootLabel is not a member of the rootSynonyms"
          (Set.notMember label synonyms)
      , delve "rootSynonyms" synonyms
      ]

-- | A set of 'Root's.
type Roots = Set.Set Root

-- | A set of 'Root's (same representation as 'Roots').
type Foundations = Set.Set Root

-- | A document reduced to what indexation needs: where it sits
-- ('documentPosition') and which 'Root's it is associated with
-- ('documentRoots'). The roots are the keys of a map whose values are
-- all @()@, so only key membership carries information.
data Document pos = Document
  { documentPosition :: pos
  -- ^ A position could be a date, a section, a page, an IP address, …
  , documentRoots :: Map.Map Root ()
  -- , documentContent :: a
  }
  deriving (Eq, Generic, Show)

-- | No 'validate' is given here, so the class's default method applies.
instance Validity pos => Validity (Document pos)

-- | 'Document's grouped under the range they were assigned to.
type DocumentByRange range pos = Map.Map range (Seq.Seq (Document pos))

-- | Group documents by the range that @mapKey@ assigns to each
-- document's 'documentPosition'.
--
-- Documents falling in the same range are merged with
-- @'Map.fromListWith' ('<>')@, which combines as @new '<>' old@: within
-- one range, a document appearing later in @docs@ therefore comes
-- /before/ an earlier one in the resulting sequence.
-- NOTE(review): confirm whether callers expect input order instead.
documentsByRange :: Ord range => (pos -> range) -> [Document pos] -> DocumentByRange range pos
documentsByRange mapKey docs = Map.fromListWith (<>) keyedDocs
  where
    -- One singleton sequence per document, tagged with its range.
    keyedDocs = [(rangeOf doc, Seq.singleton doc) | doc <- docs]
    rangeOf doc = mapKey (documentPosition doc)

-- | A pair of positions, 'rangeMin' and 'rangeMax'.
-- NOTE(review): nothing in this module checks how the two bounds
-- relate to each other (e.g. @rangeMin <= rangeMax@).
data Range pos = Range
  { rangeMin :: pos
  , rangeMax :: pos
  -- , periodScales :: [Scale]
  }
  deriving (Eq, Generic, Show)

-- | No 'validate' is given here, so the class's default method applies.
instance Validity pos => Validity (Range pos)

-- | 'Root's stored as the keys of a map whose values are all @()@.
type Vocabulary = Map.Map Root ()