{-|
Module      : Gargantext.Core.Text
Description : Ngrams tools
Copyright   : (c) CNRS, 2018
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

Text gathers terms into units of context.
-}
15 module Gargantext.Core.Text
18 import Data.Text (Text, split)
19 import Gargantext.Prelude hiding (filter)
20 import NLP.FullStop (segment)
21 import qualified Data.Text as DT
23 -----------------------------------------------------------------
26 hasText :: h -> [Text]
28 -----------------------------------------------------------------
-- French words are used as type names so that each level of textual
-- context gets its own distinct wrapper around 'Text'.

-- | A whole text.
newtype Texte = Texte Text

-- | A paragraph.
newtype Paragraphe = Paragraphe Text

-- | A sentence.
newtype Phrase = Phrase Text

-- | A multi-word term.
newtype MultiTerme = MultiTerme Text

-- | A single word.
newtype Mot = Mot Text

-- | A letter (presumably a single character; not enforced by the type).
newtype Lettre = Lettre Text
37 -- | Type syn seems obvious
40 -----------------------------------------------------------------
-- | A 'Texte' is shown exactly like the 'Text' it wraps (the constructor
-- name is not printed).
instance Show Texte where
  show (Texte wrapped) = show wrapped
-- | A 'Paragraphe' is shown exactly like the 'Text' it wraps (the
-- constructor name is not printed).
instance Show Paragraphe where
  show (Paragraphe wrapped) = show wrapped
-- | A 'Phrase' is shown exactly like the 'Text' it wraps (the constructor
-- name is not printed).
instance Show Phrase where
  show (Phrase wrapped) = show wrapped
-- | A 'MultiTerme' is shown exactly like the 'Text' it wraps (the
-- constructor name is not printed).
instance Show MultiTerme where
  show (MultiTerme wrapped) = show wrapped
-- | A 'Mot' is shown exactly like the 'Text' it wraps (the constructor
-- name is not printed).
--
-- The method is spelled out explicitly: an instance without any method
-- would fall back to the class defaults, which are defined in terms of
-- each other and therefore never terminate.
instance Show Mot where
  show (Mot m) = show m
-- | A 'Lettre' is shown exactly like the 'Text' it wraps (the constructor
-- name is not printed).
instance Show Lettre where
  show (Lettre wrapped) = show wrapped
60 -----------------------------------------------------------------
-- | Relates a larger unit of text @sup@ to the smaller unit @inf@ it is
-- made of.
--
-- Requires a multi-parameter type class extension to be enabled for this
-- module.
class Collage sup inf where
  -- | Decompose a larger unit into the list of smaller units it contains.
  dec :: sup -> [inf]
  -- | Recompose a larger unit from a list of smaller units.
  inc :: [inf] -> sup
-- | A 'Texte' is cut into 'Paragraphe's at every newline character, and
-- rebuilt by joining the paragraphs with a newline.
instance Collage Texte Paragraphe where
  dec (Texte body) = [Paragraphe chunk | chunk <- DT.splitOn "\n" body]
  inc paragraphes = Texte (DT.intercalate "\n" [p | Paragraphe p <- paragraphes])
-- | A 'Paragraphe' is cut into 'Phrase's with 'sentences', and rebuilt by
-- joining the sentences with a single space.
instance Collage Paragraphe Phrase where
  dec (Paragraphe para) = [Phrase s | s <- sentences para]
  inc phrases = Paragraphe (DT.unwords [s | Phrase s <- phrases])
-- | A 'Phrase' is cut into 'MultiTerme's on whitespace (so each resulting
-- term is a single word), and rebuilt by joining the terms with a single
-- space.
instance Collage Phrase MultiTerme where
  dec (Phrase phrase) = [MultiTerme w | w <- DT.words phrase]
  inc termes = Phrase (DT.unwords [w | MultiTerme w <- termes])
-- | A 'MultiTerme' is cut into 'Mot's on whitespace, and rebuilt by
-- joining the words with a single space.
instance Collage MultiTerme Mot where
  dec (MultiTerme terme) = [Mot w | w <- DT.words terme]
  inc mots = MultiTerme (DT.unwords [w | Mot w <- mots])
82 -------------------------------------------------------------------
-- | Split a text into sentences using 'segment' from "NLP.FullStop".
--
-- 'segment' works on 'String', so the input is unpacked first and every
-- resulting sentence is packed back into 'Text'.
sentences :: Text -> [Text]
sentences = map DT.pack . segment . DT.unpack
-- | Naive sentence splitter: break the text at every character accepted
-- by 'isCharStop'.
--
-- The stop characters themselves are dropped from the result.
sentences' :: Text -> [Text]
sentences' = split isCharStop
-- | 'True' for the sentence-ending characters @.@, @?@ and @!@;
-- 'False' for every other character.
isCharStop :: Char -> Bool
isCharStop c = case c of
  '.' -> True
  '?' -> True
  '!' -> True
  _   -> False
-- | Glue sentences back together, separating them with a single space.
unsentences :: [Text] -> Text
unsentences = DT.unwords
-- | Size of a term, counted as one more than the number of space
-- characters it contains.
--
-- This is a cheap approximation of a word count: the empty text has
-- size 1, and every space in a run of consecutive spaces is counted.
size :: Text -> Int
size t = 1 + DT.count " " t