Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Mono.hs
Cleaner Jobs API
[gargantext.git] / src / Gargantext / Core / Text / Terms / Mono.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Mono
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mono-terms are Nterms where n == 1.
11
12 -}
13
14
15 module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words)
16 where
17
18 import Prelude (String)
19
20 import Data.Text (Text)
21 import qualified Data.Text as T
22
23 import qualified Data.List as L
24 import qualified Data.Set as S
25
26 import Gargantext.Core
27 import Gargantext.Core.Types
28 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
29
30 import Gargantext.Prelude
31 --import Data.Char (isAlphaNum, isSpace)
32
33 -- | TODO remove Num ?
34 --isGram c = isAlphaNum c
35
-- | Tokenise a text into its lower-cased single-word tokens.
--
-- This is an alias of 'monoTexts', exported under the familiar name.
words :: Text -> [Text]
words txt = monoTexts txt
38
-- | Tell whether a character is a sentence separator.
--
-- Returns 'True' exactly for the punctuation, bracket and quote characters
-- listed in @separators@ below, and 'False' for every other character.
isSep :: Char -> Bool
isSep c = c `elem` separators
  where
    -- The annotation pins the literal to 'String' so that 'elem' has a
    -- concrete container type to work on.
    separators :: String
    separators = ",.:;?!(){}[]\"\'"
42
-- | Extract the mono-terms of a text for the given language.
--
-- Each token returned by 'monoTexts' is converted with 'monoText2term' and
-- paired with a count of 1. Repeated tokens are not merged here: the result
-- has one entry per token occurrence, in order of appearance.
monoTerms :: Lang -> Text -> [TermsWithCount]
monoTerms lang txt =
  [ (monoText2term lang token, 1)
  | token <- monoTexts txt
  ]
45
-- | Flat list of the tokens of a text.
--
-- Concatenates the per-sentence token lists produced by
-- 'monoTextsBySentence', dropping the sentence boundaries.
monoTexts :: Text -> [Text]
monoTexts txt = L.concat (monoTextsBySentence txt)
48
-- | Build the 'Terms' value of a single token.
--
-- The first field is the one-element list holding the token itself; the
-- second is the singleton set holding its stem for the given language.
--
-- TODO use text2term only
monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms tokens stemSet
  where
    tokens  = [txt]
    stemSet = S.singleton (stem lang txt)
52
-- | Split a text into sentences, each given as its list of tokens.
--
-- The text is lower-cased, cut at every character accepted by 'isSep', and
-- each resulting chunk is tokenised on whitespace with 'T.words'.
--
-- NOTE: 'T.split' keeps the empty chunks produced by adjacent or trailing
-- separators, so the result may contain empty sentences (@[]@).
monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence txt =
  [ T.words sentence
  | sentence <- T.split isSep (T.toLower txt)
  ]
56 . T.toLower