src/Gargantext/Ngrams/Parser.hs

   1 {-|
   2 Module      : Gargantext.Ngrams.Parser
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
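  10 Extraction of ngrams from raw text: the text is analysed with CoreNLP, each
  11 token is converted to an @SNgrams@ triple, and the language-specific modules
  11 (@En@, @Fr@) select or group the resulting ngrams.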
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude   #-}
  15 {-# LANGUAGE OverloadedStrings   #-}
  16 {-# LANGUAGE ScopedTypeVariables #-}
  17
  18 module Gargantext.Ngrams.Parser where
  19
  20 import Gargantext.Prelude
  21 import Gargantext.Ngrams.CoreNLP
  22 import Data.Text hiding (map)
  23
  24 import Gargantext.Types.Main (Language(..))
  25 import qualified Gargantext.Ngrams.Lang.En as En
  26 import qualified Gargantext.Ngrams.Lang.Fr as Fr
  27
-- | One ngram candidate, represented as a triple of 'Text' values.
--   NOTE(review): presumably (form, part-of-speech tag, named-entity tag) as
--   built by @token2text@ -- TODO confirm in "Gargantext.Ngrams.CoreNLP".
type SNgrams       = (Text, Text, Text)
  29
  30 -- | Ngrams selection algorithms
  31 -- A form is a list of characters separated by one or more spaces in a sentence.
  32 -- A word is a form.
  33
  34 -- type Form = [Char]
  35 -- For performance reasons, Type Text is used, then:
  36 -- type Form = Text
  37
  38
  39 -- Let there be a form and its associated forms in the context of a sentence.
  40 -- Forms and subforms can be represented as a Tree whose top is the minimal form
  41 -- as a monogram whose occurrences are ... (TODO: sentence left unfinished)
  42
  43 -- PS: the common @words@ function in Haskell does not take sentences into account
  44
  45
  46 -- TODO for scientific papers: add measures
  47 -- TODO add the p score regex
  48 extractNgrams :: Language -> Text -> IO [[SNgrams]]
  49 extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
  50
  51
  52 extractNgrams' :: Language -> Text -> IO [[SNgrams]]
  53 extractNgrams' lang t =  map (map token2text)
  54                      <$> map _sentenceTokens
  55                      <$> _sentences
  56                      <$> corenlp lang t
  57
  58 -- | This function selects ngrams according to grammars specific
  59 --   of each language.
  60 --   In english, JJ is ADJectiv in french.
  61 selectNgrams :: Language -> [SNgrams] -> [SNgrams]
  62 selectNgrams EN = En.selectNgrams
  63 selectNgrams FR = Fr.selectNgrams
  64
  65 -- | This function analyze and groups (or not) ngrams according to
  66 --   grammars specific of each language.
  67 groupNgrams :: Language -> [SNgrams] -> [SNgrams]
  68 groupNgrams EN = En.groupNgrams
  69 groupNgrams FR = Fr.groupNgrams
  70