src/Gargantext/Ngrams.hs

   1 {-|
   2 Module      : Gargantext.Ngrams
   3 Description : Ngrams tools
   4 Copyright   : (c) CNRS, 2018
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Ngrams exctration.
  11
  12 Definitions of ngrams.
  13 n non negative integer
  14
  15 -}
  16
  17 {-# LANGUAGE NoImplicitPrelude #-}
  18
  19 module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
  20                               --, module Gargantext.Ngrams.Hetero
  21                          , module Gargantext.Ngrams.CoreNLP
  22                          , module Gargantext.Ngrams.Parser
  23                          , module Gargantext.Ngrams.Occurrences
  24                          , module Gargantext.Ngrams.TextMining
  25                          , module Gargantext.Ngrams.Metrics
  26                          , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
  27                          , ListName(..), equivNgrams, isGram, sentences
  28                          , ngramsTest
  29                              --, module Gargantext.Ngrams.Words
  30                          ) where
  31
  32 import Gargantext.Ngrams.Letters
  33 --import Gargantext.Ngrams.Hetero
  34 import Gargantext.Ngrams.CoreNLP
  35 import Gargantext.Ngrams.Parser
  36
  37 import Gargantext.Ngrams.Occurrences
  38 import Gargantext.Ngrams.TextMining
  39 --import Gargantext.Ngrams.Words
  40
  41 import Gargantext.Ngrams.Metrics
  42 import qualified Gargantext.Ngrams.FrequentItemSet as FIS
  43 -----------------------------------------------------------------
  44
  45 import Data.List (sort)
  46 import Data.Char (Char, isAlphaNum, isSpace)
  47 import Data.Text (Text, filter, toLower, split, lines, concat)
  48 import qualified Data.Text as DT
  49 import Data.Text.IO (readFile)
  50
  51 import Data.Map.Strict  (Map
  52                         , empty
  53                         , insertWith, unionWith
  54                         , lookupIndex
  55                         --, fromList, keys
  56                         )
  57 import qualified Data.Map.Strict as M (filter)
  58 import Data.Foldable (foldl')
  59 import Gargantext.Prelude hiding (filter)
  60
  61 -- Maybe useful later:
  62 --import NLP.Stemmer (stem, Stemmer(..))
  63 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
  64 --import Language.Aspell.Options (ACOption(..))
  65
  66
  67 data ListName = Stop | Candidate | Graph
  68   deriving (Show, Eq)
  69
  70 data Ngrams = Ngrams { _ngramsNgrams   :: [Text]
  71                      , _ngramsStem     :: [Text]
  72                      , _ngramsListName :: Maybe ListName
  73                      } deriving (Show)
  74
  75 equivNgrams :: Ngrams -> Ngrams -> Bool
  76 equivNgrams  (Ngrams n1 s1 _) (Ngrams n2 s2 _)
  77   = (sort n1) == (sort n2) || (sort s1) == (sort s2)
  78
  79 type Occ     = Int
  80 --type Index   = Int
  81
  82 -- Data Ngrams = Monograms | MultiGrams
  83
  84 ngrams :: Text -> [Text]
  85 ngrams xs = monograms $ toLower $ filter isGram xs
  86
  87 clean :: Text -> Text
  88 clean txt = DT.map clean' txt
  89   where
  90     clean' '’' = '\''
  91     clean' c  = c
  92
  93 monograms :: Text -> [Text]
  94 monograms txt = split isWord txt
  95   where
  96     isWord c = c `elem` [' ', '\'', ',', ';']
  97
  98 isGram :: Char -> Bool
  99 isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
 100
 101 -- | Compute the occurrences (occ)
 102 occ :: Ord a => [a] -> Map a Occ
 103 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
 104
 105 -- TODO add groups and filter stops
 106 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
 107 sumOcc xs = foldl' (unionWith (+)) empty xs
 108
 109 --noApax :: Ord a => Map a Occ -> Map a Occ
 110 --noApax m = M.filter (>1) m
 111
 112 -- | /!\ indexes are not the same:
 113
 114 -- | Index ngrams from Map
 115 --indexNgram :: Ord a => Map a Occ -> Map Index a
 116 --indexNgram m = fromList (zip [1..] (keys m))
 117
 118 -- | Index ngrams from Map
 119 --ngramIndex :: Ord a => Map a Occ -> Map a Index
 120 --ngramIndex m = fromList (zip (keys m) [1..])
 121
 122 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
 123 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
 124
 125 indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
 126 indexIt xs = (m, is)
 127   where
 128     m  = sumOcc (map occ  xs)
 129     is = map    (indexWith m) xs
 130
 131 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
 132 list2fis n xs = (m', fs)
 133   where
 134     (m, is) = indexIt xs
 135     m'      = M.filter (>50000) m
 136     fs      = FIS.all n is
 137
 138 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
 139 text2fis n xs = list2fis n (map ngrams xs)
 140
 141 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
 142 --text2fisWith = undefined
 143
 144 -------------------------------------------------------------------
 145 -- Contexts of text
 146
 147 sentences :: Text -> [Text]
 148 sentences txt = split isStop txt
 149
 150 isStop :: Char -> Bool
 151 isStop c = c `elem` ['.','?','!']
 152
 153
 154 -- | Tests
 155 -- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
 156 ngramsTest :: (IO [Text], IO [Text], IO (Map Text Occ))
 157 ngramsTest =  (ws, ls, ocs)
 158   where
 159     txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt"
 160     -- | Number of sentences
 161     ls   = sentences <$> txt
 162     -- | Number of monograms used in the full text
 163     ws   = ngrams    <$> txt
 164     -- | stem ngrams
 165     -- TODO
 166     -- group ngrams
 167     ocs  = occ       <$> ws
 168
 169
 170
 171