]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Ngrams/Parser.hs
[SPEC] Garg Database Typed as specification.
[gargantext.git] / src / Data / Gargantext / Ngrams / Parser.hs
1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE ScopedTypeVariables #-}
3
4 module Data.Gargantext.Ngrams.Parser where
5
6 import Data.Gargantext.Prelude
7 import Data.Gargantext.Ngrams.CoreNLP
8
9
10 import Data.Gargantext.Types.Main (Language(..), Ngrams)
11 import qualified Data.Gargantext.Ngrams.Lang.En as En
12 import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr
13
14
15 -- | Ngrams selection algorithms
-- A form is a list of characters separated by one or more spaces in a sentence.
17 -- A word is a form.
18
19 -- type Form = [Char]
20 -- For performance reasons, Type Text is used, then:
21 -- type Form = Text
22
23
-- Let there be a form and its associated forms in the contexts of a sentence.
-- Forms and subforms can be represented as a Tree whose top is the minimal form
-- as a monogram whose occurrences are
27
-- PS: the common words function in Haskell does not take sentences into account
29
30
-- TODO for scientific papers: add measures
32 -- TODO add the p score regex
-- | Extract the ngrams of a text and group them for the given language.
--
-- Runs @extractNgrams'@ on the input, then applies 'groupNgrams' (for the
-- same language) to every inner list of the result.
-- NOTE(review): each inner list presumably corresponds to one sentence of
-- the CoreNLP output -- confirm against "Data.Gargantext.Ngrams.CoreNLP".
extractNgrams :: Language -> String -> IO [[Ngrams]]
extractNgrams lang s = groupEach <$> extractNgrams' lang s
  where
    -- Apply the language-specific grouping to each inner list.
    groupEach ngramLists = pm (groupNgrams lang) ngramLists
35
36
-- | Extract the raw (ungrouped) ngrams of a text.
--
-- Sends the text to 'corenlp' for the given language, takes the 'sentences'
-- of the result, reads the @_sentenceTokens@ of each one and converts every
-- token with 'token2text'.
extractNgrams' :: Language -> String -> IO [[Ngrams]]
extractNgrams' lang txt = toNgrams <$> corenlp lang txt
  where
    -- Pure post-processing of the CoreNLP result, innermost step first:
    -- sentences, then the tokens of each sentence, then token conversion.
    toNgrams parsed =
      pm (pm token2text) (pm _sentenceTokens (sentences parsed))
42
-- | Select ngrams according to the grammar specific to each language.
-- For instance, the tag JJ in English is ADJ (adjective) in French.
--
-- Dispatches to the @selectNgrams@ of the matching language module.
-- NOTE(review): only 'EN' and 'FR' are handled here; if 'Language' has other
-- constructors this match is partial -- confirm.
selectNgrams :: Language -> [Ngrams] -> [Ngrams]
selectNgrams lang = case lang of
  EN -> En.selectNgrams
  FR -> Fr.selectNgrams
49
-- | Analyze and group (or not) ngrams according to the grammar specific to
-- each language.
--
-- Dispatches to the @groupNgrams@ of the matching language module.
-- NOTE(review): only 'EN' and 'FR' are handled here; if 'Language' has other
-- constructors this match is partial -- confirm.
groupNgrams :: Language -> [Ngrams] -> [Ngrams]
groupNgrams lang = case lang of
  EN -> En.groupNgrams
  FR -> Fr.groupNgrams
55
56