]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams.hs
[FEAT/STEM] implementing Porter lib into Gargantext for English language.
[gargantext.git] / src / Gargantext / Ngrams.hs
1 {-|
2 Module : Gargantext.Ngrams
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams exctration.
11
12 Definitions of ngrams.
13 n non negative integer
14
15 -}
16
17 {-# LANGUAGE NoImplicitPrelude #-}
18
19 module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
20 --, module Gargantext.Ngrams.Hetero
21 , module Gargantext.Ngrams.CoreNLP
22 , module Gargantext.Ngrams.Parser
23 , module Gargantext.Ngrams.Occurrences
24 , module Gargantext.Ngrams.TextMining
25 , module Gargantext.Ngrams.Metrics
26 , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
27 , ListName(..), equivNgrams, isGram, sentences
28 , ngramsTest
29 --, module Gargantext.Ngrams.Words
30 ) where
31
32 import Gargantext.Ngrams.Letters
33 --import Gargantext.Ngrams.Hetero
34 import Gargantext.Ngrams.CoreNLP
35 import Gargantext.Ngrams.Parser
36
37 import Gargantext.Ngrams.Occurrences
38 import Gargantext.Ngrams.TextMining
39 --import Gargantext.Ngrams.Words
40
41 import Gargantext.Ngrams.Metrics
42 import qualified Gargantext.Ngrams.FrequentItemSet as FIS
43 -----------------------------------------------------------------
44
45 import Data.List (sort)
46 import Data.Char (Char, isAlphaNum, isSpace)
47 import Data.Text (Text, filter, toLower, split, lines, concat)
48 import qualified Data.Text as DT
49 import Data.Text.IO (readFile)
50
51 import Data.Map.Strict (Map
52 , empty
53 , insertWith, unionWith
54 , lookupIndex
55 --, fromList, keys
56 )
57 import qualified Data.Map.Strict as M (filter)
58 import Data.Foldable (foldl')
59 import Gargantext.Prelude hiding (filter)
60
61 -- Maybe useful later:
62 --import NLP.Stemmer (stem, Stemmer(..))
63 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
64 --import Language.Aspell.Options (ACOption(..))
65
66
67 data ListName = Stop | Candidate | Graph
68 deriving (Show, Eq)
69
70 data Ngrams = Ngrams { _ngramsNgrams :: [Text]
71 , _ngramsStem :: [Text]
72 , _ngramsListName :: Maybe ListName
73 } deriving (Show)
74
75 equivNgrams :: Ngrams -> Ngrams -> Bool
76 equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _)
77 = (sort n1) == (sort n2) || (sort s1) == (sort s2)
78
79 type Occ = Int
80 --type Index = Int
81
82 -- Data Ngrams = Monograms | MultiGrams
83
84 ngrams :: Text -> [Text]
85 ngrams xs = monograms $ toLower $ filter isGram xs
86
87 clean :: Text -> Text
88 clean txt = DT.map clean' txt
89 where
90 clean' '’' = '\''
91 clean' c = c
92
93 monograms :: Text -> [Text]
94 monograms txt = split isWord txt
95 where
96 isWord c = c `elem` [' ', '\'', ',', ';']
97
98 isGram :: Char -> Bool
99 isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
100
101 -- | Compute the occurrences (occ)
102 occ :: Ord a => [a] -> Map a Occ
103 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
104
105 -- TODO add groups and filter stops
106 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
107 sumOcc xs = foldl' (unionWith (+)) empty xs
108
109 --noApax :: Ord a => Map a Occ -> Map a Occ
110 --noApax m = M.filter (>1) m
111
112 -- | /!\ indexes are not the same:
113
114 -- | Index ngrams from Map
115 --indexNgram :: Ord a => Map a Occ -> Map Index a
116 --indexNgram m = fromList (zip [1..] (keys m))
117
118 -- | Index ngrams from Map
119 --ngramIndex :: Ord a => Map a Occ -> Map a Index
120 --ngramIndex m = fromList (zip (keys m) [1..])
121
122 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
123 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
124
125 indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
126 indexIt xs = (m, is)
127 where
128 m = sumOcc (map occ xs)
129 is = map (indexWith m) xs
130
131 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
132 list2fis n xs = (m', fs)
133 where
134 (m, is) = indexIt xs
135 m' = M.filter (>50000) m
136 fs = FIS.all n is
137
138 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
139 text2fis n xs = list2fis n (map ngrams xs)
140
141 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
142 --text2fisWith = undefined
143
144 -------------------------------------------------------------------
145 -- Contexts of text
146
147 sentences :: Text -> [Text]
148 sentences txt = split isStop txt
149
150 isStop :: Char -> Bool
151 isStop c = c `elem` ['.','?','!']
152
153
154 -- | Tests
155 -- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
156 ngramsTest = ws
157 where
158 txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt"
159 -- | Number of sentences
160 ls = sentences <$> txt
161 -- | Number of monograms used in the full text
162 ws = ngrams <$> txt
163 -- | stem ngrams
164 -- TODO
165 -- group ngrams
166 ocs = occ <$> ws
167
168
169
170