]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/Parser.hs
[CLEAN] Graph: unoptmized distances using Data.Matrix (conditional and
[gargantext.git] / src / Gargantext / Ngrams / Parser.hs
1 {-|
2 Module : Gargantext.Ngrams.Parser
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
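10 Extracts ngrams from a text through the CoreNLP wrapper
11 ("Gargantext.Ngrams.CoreNLP") and dispatches their selection and grouping to the language-specific modules ("Gargantext.Ngrams.Lang.En", "Gargantext.Ngrams.Lang.Fr").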
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE ScopedTypeVariables #-}
17
18 module Gargantext.Ngrams.Parser where
19
20 import Gargantext.Prelude
21 import Gargantext.Ngrams.CoreNLP
22 import Data.Text hiding (map)
23
24 import Gargantext.Types.Main (Language(..))
25 import qualified Gargantext.Ngrams.Lang.En as En
26 import qualified Gargantext.Ngrams.Lang.Fr as Fr
27
-- | A single token as handled by this module: the element type of the
-- per-sentence lists returned by 'extractNgrams'' and consumed by
-- 'selectNgrams' and 'groupNgrams'.
--
-- NOTE(review): the meaning of each 'Text' component is decided by
-- @token2text@ (imported, not defined here) -- presumably
-- (form, part-of-speech tag, named-entity tag); TODO confirm.
type SNgrams =
  ( Text
  , Text
  , Text
  )
29
30 -- | Ngrams selection algorithms
31 -- A form is a list of characters separated by one or more spaces in a sentence.
32 -- A word is a form.
33
34 -- type Form = [Char]
35 -- For performance reasons, Type Text is used, then:
36 -- type Form = Text
37
38
39 -- Consider a form and its associated forms in the context of a sentence.
40 -- Forms and subforms can be represented as a Tree whose top is the minimal form
41 -- as a monogram whose occurrences are
42
43 -- PS: the common @words@ function in Haskell does not take sentences into account
44
45
46 -- TODO for scientific papers: add measures
47 -- TODO add the p score regex
-- | Extract the ngrams of a text for the given language.
--
-- Runs 'extractNgrams'' on the text, then applies 'groupNgrams' (for the
-- same language) to each inner list of the result independently.
extractNgrams :: Language -> Text -> IO [[SNgrams]]
extractNgrams lang txt = groupEach <$> extractNgrams' lang txt
  where
    -- Grouping never crosses the boundary between two inner lists.
    groupEach perSentence = map (groupNgrams lang) perSentence
50
51
-- | Run @corenlp@ on the text and return, for every sentence of its result,
-- the sentence's tokens converted with @token2text@.
--
-- No selection or grouping is applied here; see 'extractNgrams' for the
-- grouped variant.
extractNgrams' :: Language -> Text -> IO [[SNgrams]]
extractNgrams' lang t = sentencesToNgrams <$> corenlp lang t
  where
    -- Only the fmap above goes through IO; below this point everything is
    -- plain list processing, one 'map' per nesting level.
    sentencesToNgrams parsed = map sentenceToNgrams (_sentences parsed)
    sentenceToNgrams sentence = map token2text (_sentenceTokens sentence)
57
-- | Select ngrams using the grammar of the given language.
--
-- This is a pure dispatcher: 'EN' delegates to @En.selectNgrams@ and 'FR'
-- to @Fr.selectNgrams@. Tag names differ between languages (e.g. what
-- English tags as @JJ@ is the adjective tag @ADJ@ in French), which is why
-- each language has its own implementation.
selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams lang = case lang of
  EN -> En.selectNgrams
  FR -> Fr.selectNgrams
64
-- | Analyse and group (or leave ungrouped) ngrams using the grammar of the
-- given language.
--
-- This is a pure dispatcher: 'EN' delegates to @En.groupNgrams@ and 'FR'
-- to @Fr.groupNgrams@.
groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams lang = case lang of
  EN -> En.groupNgrams
  FR -> Fr.groupNgrams
70