]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams.hs
[FEAT] Syntactic convention (proposition to be discussed).
[gargantext.git] / src / Gargantext / Ngrams.hs
1 {-|
2 Module : Gargantext.Ngrams
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams exctration.
11
12 Definitions of ngrams.
13 n non negative integer
14
15 -}
16
17 {-# LANGUAGE NoImplicitPrelude #-}
18
19 module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
20 --, module Gargantext.Ngrams.Hetero
21 , module Gargantext.Ngrams.CoreNLP
22 , module Gargantext.Ngrams.Parser
23 , module Gargantext.Ngrams.Occurrences
24 , module Gargantext.Ngrams.TextMining
25 , module Gargantext.Ngrams.Metrics
26 , Ngrams(..), ngrams, occ, sumOcc, text2fis
27 , ListName(..), equivNgrams, isGram
28 --, module Gargantext.Ngrams.Words
29 ) where
30
31 import Gargantext.Ngrams.Letters
32 --import Gargantext.Ngrams.Hetero
33 import Gargantext.Ngrams.CoreNLP
34 import Gargantext.Ngrams.Parser
35
36 import Gargantext.Ngrams.Occurrences
37 import Gargantext.Ngrams.TextMining
38 --import Gargantext.Ngrams.Words
39
40 import Gargantext.Ngrams.Metrics
41 import qualified Gargantext.Ngrams.FrequentItemSet as FIS
42 -----------------------------------------------------------------
43
44 import Data.List (sort)
45 import Data.Char (Char, isAlphaNum, isSpace)
46 import Data.Text (Text, words, filter, toLower)
47 import Data.Map.Strict (Map
48 , empty
49 , insertWith, unionWith
50 , lookupIndex
51 --, fromList, keys
52 )
53 import qualified Data.Map.Strict as M (filter)
54 import Data.Foldable (foldl')
55 import Gargantext.Prelude hiding (filter)
56
57 -- Maybe useful later:
58 --import NLP.Stemmer (stem, Stemmer(..))
59 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
60 --import Language.Aspell.Options (ACOption(..))
61
62
63 data ListName = Stop | Candidate | Graph
64 deriving (Show, Eq)
65
66 data Ngrams = Ngrams { _ngramsNgrams :: [Text]
67 , _ngramsStem :: [Text]
68 , _ngramsListName :: Maybe ListName
69 } deriving (Show)
70
71 equivNgrams :: Ngrams -> Ngrams -> Bool
72 equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _)
73 = (sort n1) == (sort n2) || (sort s1) == (sort s2)
74
75 type Occ = Int
76 --type Index = Int
77
78 -- Data Ngrams = Monograms | MultiGrams
79
80 ngrams :: Text -> [Text]
81 ngrams xs = monograms $ toLower $ filter isGram xs
82
83 monograms :: Text -> [Text]
84 monograms = words
85
86 isGram :: Char -> Bool
87 isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/']
88
89 -- | Compute the occurrences (occ)
90 occ :: Ord a => [a] -> Map a Occ
91 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
92
93 -- TODO add groups and filter stops
94 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
95 sumOcc xs = foldl' (unionWith (+)) empty xs
96
97 --noApax :: Ord a => Map a Occ -> Map a Occ
98 --noApax m = M.filter (>1) m
99
100 -- | /!\ indexes are not the same:
101
102 -- | Index ngrams from Map
103 --indexNgram :: Ord a => Map a Occ -> Map Index a
104 --indexNgram m = fromList (zip [1..] (keys m))
105
106 -- | Index ngrams from Map
107 --ngramIndex :: Ord a => Map a Occ -> Map a Index
108 --ngramIndex m = fromList (zip (keys m) [1..])
109
110 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
111 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
112
113 indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
114 indexIt xs = (m, is)
115 where
116 m = sumOcc (map occ xs)
117 is = map (indexWith m) xs
118
119 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
120 list2fis n xs = (m', fs)
121 where
122 (m, is) = indexIt xs
123 m' = M.filter (>50000) m
124 fs = FIS.all n is
125
126 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
127 text2fis n xs = list2fis n (map ngrams xs)
128
129 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
130 --text2fisWith = undefined
131
132