]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Mono.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Text / Terms / Mono.hs
1 {-|
2 Module : Gargantext.Text.Terms.Mono
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mono-terms are Nterms where n == 1.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15
16 module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words)
17 where
18
19 import Prelude (String)
20
21 import Data.Text (Text)
22 import qualified Data.Text as T
23
24 import qualified Data.List as L
25 import qualified Data.Set as S
26
27 import Gargantext.Core
28 import Gargantext.Core.Types
29 import Gargantext.Text.Terms.Mono.Stem (stem)
30
31 import Gargantext.Prelude
32 --import Data.Char (isAlphaNum, isSpace)
33
34 -- | TODO remove Num ?
35 --isGram c = isAlphaNum c
36
-- | Tokenise a text into its mono-terms.
--
-- This is a plain alias of 'monoTexts', exported under the conventional
-- name so callers can use it in place of a Prelude-style @words@.
words :: Text -> [Text]
words txt = monoTexts txt
39
-- | Tell whether a character is a sentence separator.
--
-- A character is a separator when it belongs to the fixed punctuation set
-- held in @separators@ below (comma, full stop, colon, semicolon, question
-- and exclamation marks, round, curly and square brackets, and both kinds
-- of quote).
isSep :: Char -> Bool
isSep c = c `elem` separators
  where
    -- The annotation pins the literal to 'String', exactly as the
    -- original inline annotation did.
    separators :: String
    separators = ",.:;?!(){}[]\"\'"
43
-- | Extract the mono-terms of a text as 'Terms' values.
--
-- The text is first tokenised with 'monoTexts'; each resulting token is
-- then turned into a 'Terms' by 'monoText2term' for the given language.
monoTerms :: Lang -> Text -> [Terms]
monoTerms lang = map (monoText2term lang) . monoTexts
46
-- | Tokenise a text into a flat list of tokens.
--
-- The per-sentence token lists produced by 'monoTextsBySentence' are
-- concatenated in order, discarding the sentence boundaries.
monoTexts :: Text -> [Text]
monoTexts txt = L.concat (monoTextsBySentence txt)
49
-- | Build a 'Terms' value from a single token.
--
-- The first field is the one-element list holding the token itself; the
-- second is the singleton set holding the result of 'stem' applied to the
-- token for the given language.
monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms label stems
  where
    label = [txt]
    stems = S.singleton (stem lang txt)
52
-- | Tokenise a text sentence by sentence.
--
-- The text is lower-cased, cut at every character accepted by 'isSep',
-- and each resulting chunk is split on whitespace with 'T.words'.
--
-- Note that 'T.split' keeps empty chunks, so adjacent or trailing
-- separators (and the empty input) yield empty token lists in the result.
monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence txt = map T.words sentences
  where
    lowered   = T.toLower txt
    sentences = T.split isSep lowered
57
58
59