{-| Module : Gargantext.Core.Text Description : Ngrams tools Copyright : (c) CNRS, 2018 License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX Text gathers terms in unit of contexts. -} module Gargantext.Core.Text where import Data.Text (Text, split) import Gargantext.Prelude hiding (filter) import NLP.FullStop (segment) import qualified Data.Text as DT ----------------------------------------------------------------- class HasText h where hasText :: h -> [Text] ----------------------------------------------------------------- -- French words to distinguish contexts newtype Texte = Texte Text newtype Paragraphe = Paragraphe Text newtype Phrase = Phrase Text newtype MultiTerme = MultiTerme Text newtype Mot = Mot Text newtype Lettre = Lettre Text -- | Type syn seems obvious type Titre = Phrase ----------------------------------------------------------------- instance Show Texte where show (Texte t) = show t instance Show Paragraphe where show (Paragraphe p) = show p instance Show Phrase where show (Phrase p) = show p instance Show MultiTerme where show (MultiTerme mt) = show mt instance Show Mot where show (Mot t) = show t instance Show Lettre where show (Lettre l) = show l ----------------------------------------------------------------- class Collage sup inf where dec :: sup -> [inf] inc :: [inf] -> sup instance Collage Texte Paragraphe where dec (Texte t) = map Paragraphe $ DT.splitOn "\n" t inc = Texte . DT.intercalate "\n" . map (\(Paragraphe t) -> t) instance Collage Paragraphe Phrase where dec (Paragraphe t) = map Phrase $ sentences t inc = Paragraphe . DT.unwords . map (\(Phrase p) -> p) instance Collage Phrase MultiTerme where dec (Phrase t) = map MultiTerme $ DT.words t inc = Phrase . DT.unwords . map (\(MultiTerme p) -> p) instance Collage MultiTerme Mot where dec (MultiTerme mt) = map Mot $ DT.words mt inc = MultiTerme . DT.intercalate " " . map (\(Mot m) -> m) ------------------------------------------------------------------- -- Contexts of text sentences :: Text -> [Text] sentences txt = map DT.pack $ segment $ DT.unpack txt sentences' :: Text -> [Text] sentences' txt = split isCharStop txt isCharStop :: Char -> Bool isCharStop c = c `elem` ['.','?','!'] unsentences :: [Text] -> Text unsentences txts = DT.intercalate " " txts -- | Ngrams size size :: Text -> Int size t = 1 + DT.count " " t