]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics/FrequentItemSet.hs
Merge branch 'master' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantext
[gargantext.git] / src / Gargantext / Text / Metrics / FrequentItemSet.hs
1 {-|
2 Module : Gargantext.Text.Metrics.FrequentItemSet
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Domain Specific Language to manage Frequent Item Set (FIS)
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15
16 module Gargantext.Text.Metrics.FrequentItemSet
17 ( Fis, Size(..)
18 , occ_hlcm, cooc_hlcm
19 , all, between
20 , fisWithSize
21 , fisWith
22 , fisWithSizePoly
23 , fisWithSizePoly2
24 , fisWithSizePolyMap
25 , module HLCM
26 )
27 where
28
29 import Prelude (Functor(..)) -- TODO
30 import qualified Data.Map.Strict as Map
31 import Data.Map.Strict (Map)
32 import qualified Data.Set as Set
33 import Data.Set (Set)
34 import qualified Data.Vector as V
35 import Data.Vector (Vector)
36
37 import Data.List (filter, concat)
38 import Data.Maybe (catMaybes)
39
40 import HLCM
41
42 import Gargantext.Prelude
43
-- | Size constraint on frequent item sets: either one exact size
-- ('Point') or a range whose two bounds are both inclusive ('Segment').
data Size
  = Point Int        -- ^ exactly this size
  | Segment Int Int  -- ^ lower bound, then upper bound (both inclusive)
45
46 ------------------------------------------------------------------------
-- | Occurrences, i.e. the frequent item sets selected by @'Point' 1@.
occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm = fisWithSize size
  where
    size = Point 1
50
-- | Cooccurrences, i.e. the frequent item sets selected by @'Point' 2@.
cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm = fisWithSize size
  where
    size = Point 2
54
-- | Every frequent item set, whatever its size: 'fisWith' is called
-- without any predicate, so nothing is filtered out.
all :: Frequency -> [[Item]] -> [Fis]
all = fisWith noFilter
  where
    noFilter = Nothing
57
58 ------------------------------------------------------------------------
-- | Frequent item sets selected by the 'Segment' built from the given
-- pair of bounds (first component: lower bound, second: upper bound).
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (lo, hi) = fisWithSize range
  where
    range = Segment lo hi
61
62 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
63 --maximum m = between (0,m)
64
65
66 ------------------------------------------------------------------------
67 ------------------------------------------------------------------------
-- | Frequent item set over HLCM's 'Item'.
type Fis = Fis' Item

-- | Frequent item set: a count paired with the items it refers to.
--
-- TODO: replace the list with a Set in '_fisItemSet'.
-- Be careful: doing so risks erasing HLCM behaviour.
data Fis' a = Fis'
  { _fisCount   :: Int  -- ^ count attached to the item set (presumably the support reported by HLCM -- confirm)
  , _fisItemSet :: [a]  -- ^ the items themselves
  } deriving (Show)
75
-- | Maps over the items only; the count is left untouched.
instance Functor Fis' where
  fmap g (Fis' count items) = Fis' count (g <$> items)
78
-- | Sugar from a raw row to a 'Fis': the head of the row becomes
-- '_fisCount' and the remaining elements become '_fisItemSet'.
-- An empty row has no head and therefore yields 'Nothing'.
items2fis :: [Item] -> Maybe Fis
items2fis row = case row of
  []            -> Nothing
  count : items -> Just (Fis' count items)
83
84 ------------------------------------------------------------------------
85 ------------------------------------------------------------------------
86
-- | Frequent item sets whose number of items satisfies the given 'Size'.
--
-- The predicate handed to 'fisWith' is applied to raw rows, which
-- 'items2fis' later splits into a leading count followed by the items.
-- A row describing @k@ items therefore has length @k + 1@, and every bound
-- is shifted accordingly.  Previously only 'Point' applied that shift, so
-- @'Segment' k k@ and @'Point' k@ disagreed by one item.
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = fisWith (Just keep) f is
  where
    -- Length of the raw row that carries an item set of @k@ items.
    rowLen k = k + 1

    keep = case n of
      Point n'    -> \row -> length row == rowLen n'
      Segment a b -> \row ->
        let len = length row
        in  rowLen a <= len && len <= rowLen b
93
94
-- | Run HLCM on the rows with the given frequency, optionally filter the
-- raw result rows with a predicate, and convert what is left to 'Fis'.
-- Rows that 'items2fis' cannot convert (empty ones) are discarded.
--
-- TODO (from the original note): filter on Fis and not on [Item].
fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
fisWith s f is = catMaybes (map items2fis kept)
  -- drop unMaybe
  where
    rawRows = runLCMmatrix is f
    kept    = select s rawRows

    -- No predicate: keep every row; otherwise keep the rows it accepts.
    select Nothing     = identity
    select (Just keep) = filter keep
103
-- | Polymorphic variant of 'fisWithSize': items of any ordered type are
-- translated to indices before the computation and translated back
-- afterwards.
--
-- The keys are taken as a 'Set' only to state that duplicates are not
-- wanted.  Each key is numbered from 0 following the order of
-- 'Set.toList'.
--
-- Precondition: every element of the input rows must belong to the key
-- set; the encoding uses 'Map.!', which fails at runtime on a missing key.
fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
fisWithSizePoly n f ks = decodeAll . fisWithSize n f . encodeAll
  where
    keys      = Set.toList ks
    byIndex   = V.fromList keys
    toIndex   = Map.fromList (zip keys [0..])

    encodeAll = map (map (\key -> toIndex Map.! key))
    decodeAll = map (fmap (\ix -> byIndex V.! ix))
113
-- | Same as 'fisWithSizePoly', with the key set computed from the rows
-- themselves (the set of all elements occurring in them).
fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
fisWithSizePoly2 n f is = fisWithSizePoly n f (Set.fromList allItems) is
  where
    allItems = concat is
118
-- | Result of 'fisWithSizePoly2' as a map from each item set (as a 'Set')
-- to its count.  Should several results collapse to the same 'Set',
-- 'Map.fromList' keeps the last one.
fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
fisWithSizePolyMap n f is =
  Map.fromList
    [ (Set.fromList (_fisItemSet fis), _fisCount fis)
    | fis <- fisWithSizePoly2 n f is
    ]
122
123
124 ------------------------------------------------------------------------
125 ------------------------------------------------------------------------
126
127
128 --
129 ---- | /!\ indexes are not the same:
130 --
131 ---- | Index ngrams from Map
132 ----indexNgram :: Ord a => Map a Occ -> Map Index a
133 ----indexNgram m = fromList (zip [1..] (keys m))
134 --
135 ---- | Index ngrams from Map
136 ----ngramIndex :: Ord a => Map a Occ -> Map a Index
137 ----ngramIndex m = fromList (zip (keys m) [1..])
138 --
139 --indexWith :: Ord a => Map a Occ -> [a] -> [Int]
140 --indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
141 --
142 --indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
143 --indexIt xs = (m, is)
144 -- where
145 -- m = sumOcc (map occ xs)
146 -- is = map (indexWith m) xs
147 --
148 --list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
149 --list2fis n xs = (m', fs)
150 -- where
151 -- (m, is) = indexIt xs
152 -- m' = M.filter (>50000) m
153 -- fs = FIS.all n is
154 --
155 --text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
156 --text2fis n xs = list2fis n (map terms xs)
157 --
158 ----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
159 ----text2fisWith = undefined
160 --
161