]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Metrics/FrequentItemSet.hs
[CryptoRandom] wip
[gargantext.git] / src / Gargantext / Core / Text / Metrics / FrequentItemSet.hs
1 {-|
2 Module : Gargantext.Core.Text.Metrics.FrequentItemSet
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Domain Specific Language to manage Frequent Item Set (FIS)
11
12 -}
13
14
15 module Gargantext.Core.Text.Metrics.FrequentItemSet
16 ( Fis, Size(..)
17 , occ_hlcm, cooc_hlcm
18 , allFis, between
19 , fisWithSize
20 , fisWith
21 , fisWithSizePoly
22 , fisWithSizePoly2
23 , fisWithSizePolyMap
24 , module HLCM
25 )
26 where
27
28 import Data.List (concat, null)
29 import Data.Map.Strict (Map)
30 import Data.Maybe (catMaybes)
31 import Data.Set (Set)
32 import Gargantext.Prelude
33 import HLCM
34 import Prelude (Functor(..)) -- TODO
35 import qualified Data.Map.Strict as Map
36 import qualified Data.Set as Set
37 import qualified Data.Vector as V
38
-- | Size constraint used to select frequent item sets.
--
-- How a size is measured against a mined row is decided by 'fisWithSize'.
data Size
  = Point Int
    -- ^ A single target size.
  | Segment Int Int
    -- ^ An inclusive range: lower bound first, upper bound second.
40
41 ------------------------------------------------------------------------
-- | Occurrences: the frequent item sets of size 1.
--
-- Shorthand for 'fisWithSize' with @Point 1@.
occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm freq transactions = fisWithSize (Point 1) freq transactions
45
-- | Cooccurrences: the frequent item sets of size 2.
--
-- Shorthand for 'fisWithSize' with @Point 2@.
cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm freq transactions = fisWithSize (Point 2) freq transactions
49
-- | Every frequent item set, whatever its size.
--
-- Calls 'fisWith' without any predicate, so no mined row is filtered out.
allFis :: Frequency -> [[Item]] -> [Fis]
allFis freq transactions = fisWith Nothing freq transactions
52
53 ------------------------------------------------------------------------
-- | Frequent item sets whose size lies in the given @(lower, upper)@ range.
--
-- The pair is turned into a 'Segment' and handed to 'fisWithSize', which
-- defines how the bounds are compared with each mined row.
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between bounds freq transactions = case bounds of
  (lower, upper) -> fisWithSize (Segment lower upper) freq transactions
56
57 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
58 --maximum m = between (0,m)
59
60
61 ------------------------------------------------------------------------
62 ------------------------------------------------------------------------
-- | A Frequent Item Set specialised to HLCM's 'Item'.
--
-- TODO: replace the list with a Set in '_fisItemSet'.
-- Be careful: doing so risks erasing HLCM behavior.
type Fis = Fis' Item

-- | A Frequent Item Set over items of type @a@.
data Fis' a = Fis'
  { _fisCount   :: Int
    -- ^ Count attached to the item set (the head of a mined row, see 'items2fis').
  , _fisItemSet :: [a]
    -- ^ The items of the set, kept in the order they were mined.
  }
  deriving Show
70
-- | Maps over the items only; the count is carried over unchanged.
instance Functor Fis' where
  fmap g Fis' { _fisCount = n, _fisItemSet = xs } =
    Fis' { _fisCount = n, _fisItemSet = map g xs }
73
-- | Build a 'Fis' from a raw row: the head of the row becomes the count and
-- the tail becomes the item set. An empty row yields 'Nothing'.
items2fis :: [Item] -> Maybe Fis
items2fis row = case row of
  []              -> Nothing
  (count : items) -> Just Fis' { _fisCount = count, _fisItemSet = items }
78
79 ------------------------------------------------------------------------
80 ------------------------------------------------------------------------
81
-- | Mine the frequent item sets whose number of items matches the given 'Size'.
--
-- The size test runs on the raw rows handed to 'fisWith', i.e. before
-- 'items2fis' splits each row into its leading count and its items. A set of
-- @k@ items is therefore a row of length @k + 1@, and both constructors
-- account for that extra leading element:
--
-- * @Point k@     keeps the sets of exactly @k@ items;
-- * @Segment a b@ keeps the sets of at least @a@ and at most @b@ items.
--
-- NOTE: only 'Point' used to compensate for the leading count, so
-- @Segment a b@ selected sets of @a - 1@ to @b - 1@ items. Both cases now
-- measure the size the same way.
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = fisWith (Just fitsSize) f is
  where
    -- Length of a raw row carrying @k@ items plus its leading count.
    rowLength k = k + 1

    fitsSize row =
      let rowLen = length row
      in case n of
           Point n'    -> rowLen == rowLength n'
           Segment a b -> rowLength a <= rowLen && rowLen <= rowLength b
88
89
-- | Mine frequent item sets, optionally keeping only the rows accepted by a
-- predicate.
--
-- Empty transactions are removed first; when nothing is left the result is
-- @[]@ and the miner is not called. Otherwise 'runLCMmatrix' is run on the
-- remaining transactions, the optional predicate is applied to its raw rows,
-- and every row is converted with 'items2fis' (rows it rejects are dropped).
--
-- TODO: filter on 'Fis' and not on @[Item]@.
fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
fisWith s f is = mine (filter (not . null) is)
  where
    mine [] = []
    mine js = catMaybes (map items2fis (select s (runLCMmatrix js f)))

    -- No predicate means every mined row is kept.
    select Nothing     = identity
    select (Just keep) = filter keep
100
-- | 'fisWithSize' for any ordered item type.
--
-- The keys are numbered from 0 in ascending order, the transactions are
-- translated to those numbers, mined with 'fisWithSize', and the resulting
-- item sets are translated back to the original keys.
--
-- The keys are taken as a 'Set' only to state that duplicates are not wanted.
--
-- Every element of the transactions must belong to the key set: the
-- translation uses 'Map.!', which raises an error on a missing key.
-- NOTE(review): the way back uses 'V.!' and so assumes the miner only
-- returns numbers it was given -- confirm against HLCM.
fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
fisWithSizePoly n f ks transactions =
  map (fmap decode) (fisWithSize n f (map (map encode) transactions))
  where
    keys     = Set.toList ks
    byIndex  = V.fromList keys
    indexOf  = Map.fromList (zip keys [0..])
    encode k = indexOf Map.! k
    decode i = byIndex V.! i
110
-- | Like 'fisWithSizePoly', with the key set computed from the transactions
-- themselves: every element occurring in any transaction is a key.
fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
fisWithSizePoly2 n f is = fisWithSizePoly n f (Set.fromList (concat is)) is
115
-- | Result of 'fisWithSizePoly2' as a map from each item set (as a 'Set') to
-- its count. If two results hold the same set of items, 'Map.fromList' keeps
-- the count of the one that comes last.
fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
fisWithSizePolyMap n f is =
  Map.fromList
    [ (Set.fromList (_fisItemSet fis), _fisCount fis)
    | fis <- fisWithSizePoly2 n f is
    ]
119
120
121 ------------------------------------------------------------------------
122 ------------------------------------------------------------------------
123
124
125 --
126 ---- | /!\ indexes are not the same:
127 --
128 ---- | Index ngrams from Map
129 ----indexNgram :: Ord a => Map a Occ -> Map Index a
130 ----indexNgram m = fromList (zip [1..] (keys m))
131 --
132 ---- | Index ngrams from Map
133 ----ngramIndex :: Ord a => Map a Occ -> Map a Index
134 ----ngramIndex m = fromList (zip (keys m) [1..])
135 --
136 --indexWith :: Ord a => Map a Occ -> [a] -> [Int]
137 --indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
138 --
139 --indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
140 --indexIt xs = (m, is)
141 -- where
142 -- m = sumOcc (map occ xs)
143 -- is = map (indexWith m) xs
144 --
145 --list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
146 --list2fis n xs = (m', fs)
147 -- where
148 -- (m, is) = indexIt xs
149 -- m' = M.filter (>50000) m
150 -- fs = FIS.all n is
151 --
152 --text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
153 --text2fis n xs = list2fis n (map terms xs)
154 --
155 ----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
156 ----text2fisWith = undefined
157 --
158