2 Module : Gargantext.Ngrams
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 Definitions of ngrams.
13 n non negative integer
17 {-# LANGUAGE NoImplicitPrelude #-}
19 module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
20 --, module Gargantext.Ngrams.Hetero
21 , module Gargantext.Ngrams.CoreNLP
22 , module Gargantext.Ngrams.Parser
23 , module Gargantext.Ngrams.Occurrences
24 , module Gargantext.Ngrams.TextMining
25 , module Gargantext.Ngrams.Metrics
26 , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
27 , ListName(..), equivNgrams, isGram, sentences
29 --, module Gargantext.Ngrams.Words
32 import Gargantext.Ngrams.Letters
33 --import Gargantext.Ngrams.Hetero
34 import Gargantext.Ngrams.CoreNLP
35 import Gargantext.Ngrams.Parser
37 import Gargantext.Ngrams.Occurrences
38 import Gargantext.Ngrams.TextMining
39 --import Gargantext.Ngrams.Words
41 import Gargantext.Ngrams.Metrics
42 import qualified Gargantext.Ngrams.FrequentItemSet as FIS
43 -----------------------------------------------------------------
45 import Data.List (sort)
46 import Data.Char (Char, isAlphaNum, isSpace)
47 import Data.Text (Text, filter, toLower, split, lines, concat)
48 import qualified Data.Text as DT
49 import Data.Text.IO (readFile)
51 import Data.Map.Strict (Map
53 , insertWith, unionWith
57 import qualified Data.Map.Strict as M (filter)
58 import Data.Foldable (foldl')
59 import Gargantext.Prelude hiding (filter)
61 -- Maybe useful later:
62 --import NLP.Stemmer (stem, Stemmer(..))
63 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
64 --import Language.Aspell.Options (ACOption(..))
67 data ListName = Stop | Candidate | Graph
70 data Ngrams = Ngrams { _ngramsNgrams :: [Text]
71 , _ngramsStem :: [Text]
72 , _ngramsListName :: Maybe ListName
75 equivNgrams :: Ngrams -> Ngrams -> Bool
76 equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _)
77 = (sort n1) == (sort n2) || (sort s1) == (sort s2)
82 -- Data Ngrams = Monograms | MultiGrams
84 ngrams :: Text -> [Text]
85 ngrams xs = monograms $ toLower $ filter isGram xs
88 clean txt = DT.map clean' txt
93 monograms :: Text -> [Text]
94 monograms txt = split isWord txt
96 isWord c = c `elem` [' ', '\'', ',', ';']
98 isGram :: Char -> Bool
99 isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
101 -- | Compute the occurrences (occ)
102 occ :: Ord a => [a] -> Map a Occ
103 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
105 -- TODO add groups and filter stops
106 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
107 sumOcc xs = foldl' (unionWith (+)) empty xs
109 --noApax :: Ord a => Map a Occ -> Map a Occ
110 --noApax m = M.filter (>1) m
112 -- | /!\ indexes are not the same:
114 -- | Index ngrams from Map
115 --indexNgram :: Ord a => Map a Occ -> Map Index a
116 --indexNgram m = fromList (zip [1..] (keys m))
118 -- | Index ngrams from Map
119 --ngramIndex :: Ord a => Map a Occ -> Map a Index
120 --ngramIndex m = fromList (zip (keys m) [1..])
122 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
123 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
125 indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
128 m = sumOcc (map occ xs)
129 is = map (indexWith m) xs
131 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
132 list2fis n xs = (m', fs)
135 m' = M.filter (>50000) m
138 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
139 text2fis n xs = list2fis n (map ngrams xs)
141 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
142 --text2fisWith = undefined
144 -------------------------------------------------------------------
147 sentences :: Text -> [Text]
148 sentences txt = split isStop txt
150 isStop :: Char -> Bool
151 isStop c = c `elem` ['.','?','!']
155 -- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
156 ngramsTest :: (IO [Text], IO [Text], IO (Map Text Occ))
157 ngramsTest = (ws, ls, ocs)
159 txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt"
160 -- | Number of sentences
161 ls = sentences <$> txt
162 -- | Number of monograms used in the full text