src/Gargantext/Ngrams/Parser.hs

   1 {-# LANGUAGE OverloadedStrings #-}
   2 {-# LANGUAGE ScopedTypeVariables #-}
   3
   4 module Gargantext.Ngrams.Parser where
   5
   6 import Gargantext.Prelude
   7 import Gargantext.Ngrams.CoreNLP
   8
   9
  10 import Gargantext.Types.Main (Language(..), Ngrams)
  11 import qualified Gargantext.Ngrams.Lang.En as En
  12 import qualified Gargantext.Ngrams.Lang.Fr as Fr
  13
  14
  15 -- | Ngrams selection algorithms
-- A form is a sequence of characters delimited by one or more spaces in a sentence.
  17 -- A word is a form.
  18
  19 -- type Form = [Char]
  20 -- For performance reasons, Type Text is used, then:
  21 -- type Form = Text
  22
  23
-- Consider a form and its associated forms in the context of a sentence.
-- Forms and subforms can be represented as a tree whose root is the minimal
-- form, i.e. a monogram whose occurrences are ... (TODO: complete this sentence)
  27
-- PS: the common `words` function in Haskell does not take sentences into account.
  29
  30
-- TODO for scientific papers: add measures
  32 -- TODO add the p score regex
  33 extractNgrams :: Language -> String -> IO [[Ngrams]]
  34 extractNgrams lang s = pm (groupNgrams lang) <$> extractNgrams' lang s
  35
  36
  37 extractNgrams' :: Language -> String -> IO [[Ngrams]]
  38 extractNgrams' lang t =  pm (pm token2text)
  39                      <$> pm _sentenceTokens
  40                      <$> sentences
  41                      <$> corenlp lang t
  42
  43 -- | This function selects ngrams according to grammars specific
  44 --   of each language.
  45 --   In english, JJ is ADJectiv in french.
  46 selectNgrams :: Language -> [Ngrams] -> [Ngrams]
  47 selectNgrams EN = En.selectNgrams
  48 selectNgrams FR = Fr.selectNgrams
  49
  50 -- | This function analyze and groups (or not) ngrams according to
  51 --   grammars specific of each language.
  52 groupNgrams :: Language -> [Ngrams] -> [Ngrams]
  53 groupNgrams EN = En.groupNgrams
  54 groupNgrams FR = Fr.groupNgrams
  55
  56