]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text.hs
[FIX] Ngrams List size with candidates
[gargantext.git] / src / Gargantext / Core / Text.hs
1 {-|
2 Module : Gargantext.Core.Text
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Text gathers terms in unit of contexts.
11
12 -}
13
14
15 module Gargantext.Core.Text
16 where
17
18 import Data.Text (Text, split)
19 import Gargantext.Prelude hiding (filter)
20 import NLP.FullStop (segment)
21 import qualified Data.Text as DT
22
23 -----------------------------------------------------------------
-- | Types from which a list of 'Text' values can be extracted.
class HasText h where
  -- | Return the 'Text' values carried by the given value.
  hasText :: h -> [Text]
27
28 -----------------------------------------------------------------
-- French words are used to tell the different contexts of a text apart.
-- Each context is a distinct wrapper around 'Text', so that one level
-- cannot be passed where another one is expected.

-- | A text (French: /texte/).
newtype Texte = Texte Text

-- | A paragraph (French: /paragraphe/).
newtype Paragraphe = Paragraphe Text

-- | A sentence (French: /phrase/).
newtype Phrase = Phrase Text

-- | A multi-term (French: /multi-terme/).
newtype MultiTerme = MultiTerme Text

-- | A word (French: /mot/).
newtype Mot = Mot Text

-- | A letter (French: /lettre/).
newtype Lettre = Lettre Text
36
-- | A title is represented as a 'Phrase'.
type Titre = Phrase
39
40 -----------------------------------------------------------------
41
-- | Displayed by delegating to the wrapped 'Text'; the 'Texte' constructor
-- itself is not shown.
instance Show Texte where
  show texte = case texte of
    Texte inner -> show inner
44
-- | Displayed by delegating to the wrapped 'Text'; the 'Paragraphe'
-- constructor itself is not shown.
instance Show Paragraphe where
  show paragraphe = case paragraphe of
    Paragraphe inner -> show inner
47
-- | Displayed by delegating to the wrapped 'Text'; the 'Phrase' constructor
-- itself is not shown.
instance Show Phrase where
  show phrase = case phrase of
    Phrase inner -> show inner
50
-- | Displayed by delegating to the wrapped 'Text'; the 'MultiTerme'
-- constructor itself is not shown.
instance Show MultiTerme where
  show multiTerme = case multiTerme of
    MultiTerme inner -> show inner
53
-- | Displayed by delegating to the wrapped 'Text'; the 'Mot' constructor
-- itself is not shown.
instance Show Mot where
  show mot = case mot of
    Mot inner -> show inner
56
-- | Displayed by delegating to the wrapped 'Text'; the 'Lettre' constructor
-- itself is not shown.
instance Show Lettre where
  show lettre = case lettre of
    Lettre inner -> show inner
59
60 -----------------------------------------------------------------
61
-- | Relates a larger unit of text @sup@ to the smaller units @inf@ it is
-- made of.
class Collage sup inf where
  -- | Break one @sup@ value into its list of @inf@ parts.
  dec :: sup -> [inf]

  -- | Assemble a list of @inf@ parts into one @sup@ value.
  inc :: [inf] -> sup
65
-- | A 'Texte' is cut into 'Paragraphe's at every newline and rebuilt by
-- joining the paragraphs with a newline.
instance Collage Texte Paragraphe where
  dec (Texte body) = [Paragraphe chunk | chunk <- DT.splitOn "\n" body]
  inc paragraphes  = Texte (DT.intercalate "\n" [chunk | Paragraphe chunk <- paragraphes])
69
-- | A 'Paragraphe' is cut into 'Phrase's with 'sentences' and rebuilt by
-- joining the phrases with a single space.
instance Collage Paragraphe Phrase where
  dec (Paragraphe body) = [Phrase sentence | sentence <- sentences body]
  inc phrases           = Paragraphe (DT.unwords [sentence | Phrase sentence <- phrases])
73
-- | A 'Phrase' is cut into 'MultiTerme's on whitespace and rebuilt by
-- joining the terms with a single space.
instance Collage Phrase MultiTerme where
  dec (Phrase body) = [MultiTerme term | term <- DT.words body]
  inc termes        = Phrase (DT.unwords [term | MultiTerme term <- termes])
77
-- | A 'MultiTerme' is cut into 'Mot's on whitespace and rebuilt by joining
-- the words with a single space.
instance Collage MultiTerme Mot where
  dec (MultiTerme body) = [Mot word | word <- DT.words body]
  inc mots              = MultiTerme (DT.unwords [word | Mot word <- mots])
81
82 -------------------------------------------------------------------
83 -- Contexts of text
-- | Segment a text with 'segment' from "NLP.FullStop".
--
-- 'segment' is applied to the unpacked 'String', and every piece it
-- returns is packed back into a 'Text'.
sentences :: Text -> [Text]
sentences = map DT.pack . segment . DT.unpack
86
-- | Cut a text at every character accepted by 'isCharStop'.
--
-- The stop characters act as separators and are not kept in the pieces.
sentences' :: Text -> [Text]
sentences' = split isCharStop
89
-- | 'True' for the characters @.@, @?@ and @!@; 'False' for anything else.
isCharStop :: Char -> Bool
isCharStop '.' = True
isCharStop '?' = True
isCharStop '!' = True
isCharStop _   = False
92
-- | Join pieces of text into one, separating them with a single space.
unsentences :: [Text] -> Text
unsentences = DT.unwords
95
-- | Ngrams size: the number of whitespace-separated words in the text.
--
-- Whitespace is normalised before counting (runs are collapsed and both
-- ends are trimmed), so stray spaces do not inflate the result: @"a  b"@
-- and @" a b "@ both have size 2. A text that contains no word at all
-- still has size 1.
size :: Text -> Int
size t = 1 + DT.count (DT.singleton ' ') normalised
  where
    -- After 'DT.words' / 'DT.unwords' exactly one space remains between two
    -- consecutive words, so counting spaces is the same as counting gaps.
    normalised = DT.unwords (DT.words t)
99
100
101