]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Multi.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15
16 module Gargantext.Text.Terms.Multi (multiterms, multiterms_rake)
17 where
18
19 import Data.Text hiding (map, group, filter, concat)
20 import Data.List (concat)
21 import qualified Data.Set as S
22
23 import Gargantext.Prelude
24 import Gargantext.Core (Lang(..))
25 import Gargantext.Core.Types
26
27 import Gargantext.Text.Terms.Multi.PosTagging
28 import Gargantext.Text.Terms.Mono.Stem (stem)
29 import qualified Gargantext.Text.Terms.Multi.Lang.En as En
30 import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
31
32 import Gargantext.Text.Terms.Multi.RAKE (multiterms_rake)
33
-- | Extract the multi-terms of a text.
--
-- Runs 'tokenTags' on the input, keeps in every inner list only the tags
-- whose '_my_token_pos' equals @Just NP@, converts each kept tag with
-- 'tokenTag2terms', and flattens the per-list results into a single list
-- (original order preserved).
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = collect <$> tokenTags lang txt
  where
    -- Predicate selecting the tags that are turned into terms.
    isNP tag = _my_token_pos tag == Just NP

    -- Terms of one inner list of tags.
    termsOf tags = map (tokenTag2terms lang) (filter isNP tags)

    -- Terms of every inner list, concatenated.
    collect tagLists = concat (map termsOf tagLists)
39
-- | Build a 'Terms' value from a 'TokenTag'.
--
-- The first field of the tag is passed through unchanged; every element of
-- the second field (a set) is run through @'stem' lang@. The two remaining
-- fields of the tag are ignored.
tokenTag2terms :: Lang -> TokenTag -> Terms
tokenTag2terms lang (TokenTag label forms _ _) =
  Terms label (S.map (stem lang) forms)
44
-- | Tag a text and regroup the tags.
--
-- Obtains the raw tag lists from @tokenTags'@ and applies @'group' lang@ to
-- each inner list independently.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang txt = regroup <$> tokenTags' lang txt
  where
    regroup tagLists = [ group lang tags | tags <- tagLists ]
47
48
-- | Tag a text without any regrouping.
--
-- Calls 'corenlp' on the text, then for every element of '_sentences' takes
-- its '_sentenceTokens' and converts them with 'tokens2tokensTags', giving
-- one list of tags per element.
-- NOTE(review): 'corenlp' presumably comes from the PosTagging module
-- imported above and performs the external tagging call; confirm there.
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = toTagLists <$> corenlp lang t
  where
    toTagLists parsed =
      [ tokens2tokensTags (_sentenceTokens sentence)
      | sentence <- _sentences parsed
      ]
54
-- | Analyse and group (or not) the tags of one list according to the
-- grammar of the given language.
--
-- Dispatches to the language-specific implementation: 'En.group' for 'EN',
-- 'Fr.group' for 'FR'. No other language is handled here.
group :: Lang -> [TokenTag] -> [TokenTag]
group EN tags = En.group tags
group FR tags = Fr.group tags
-- The catch-all equation below is disabled in the original source.
-- NOTE(review): confirm whether 'Lang' has constructors other than EN and FR
-- before re-enabling it.
-- group _ = panic $ pack "group :: Lang not implemented yet"