2 Module : Gargantext.Core.Text.Metrics.FrequentItemSet
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Domain Specific Language to manage Frequent Item Set (FIS)
15 module Gargantext.Core.Text.Metrics.FrequentItemSet
29 import Data.List (concat, null)
30 import Data.Map.Strict (Map)
31 import Data.Maybe (catMaybes)
33 import Gargantext.Prelude
35 import qualified Data.Map.Strict as Map
36 import qualified Data.Set as Set
37 import qualified Data.Vector as V
39 import Control.Monad (sequence)
41 data Size = Point Int | Segment Int Int
43 ------------------------------------------------------------------------
44 -- | Occurrence is Frequent Item Set of size 1
45 occ_hlcm :: Frequency -> [[Item]] -> [Fis]
46 occ_hlcm = fisWithSize (Point 1)
48 -- | Cooccurrence is Frequent Item Set of size 2
49 cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
50 cooc_hlcm = fisWithSize (Point 2)
52 allFis :: Frequency -> [[Item]] -> [Fis]
53 allFis = fisWith Nothing
55 ------------------------------------------------------------------------
56 between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
57 between (x,y) = fisWithSize (Segment x y)
59 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
60 --maximum m = between (0,m)
63 ------------------------------------------------------------------------
64 ------------------------------------------------------------------------
65 -- | Data type to type the Frequent Item Set
66 -- TODO replace List with Set in fisItemSet
67 -- be careful : risks to erase HLCM behavior
69 data Fis' a = Fis' { _fisCount :: Int
73 instance Functor Fis' where
74 fmap f (Fis' c is) = Fis' c (fmap f is)
76 -- | Sugar from items to FIS
77 items2fis :: [Item] -> Maybe Fis
78 items2fis [] = Nothing
79 items2fis (i:is) = Just $ Fis' i is
81 ------------------------------------------------------------------------
82 ------------------------------------------------------------------------
84 fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
85 fisWithSize n f is = case n of
86 Point n' -> fisWith (Just (\x -> length x == (n'+1) )) f is
87 Segment a b -> fisWith (Just (\x -> cond a (length x) b)) f is
89 cond a' x b' = a' <= x && x <= b'
92 --- Filter on Fis and not on [Item]
93 fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
94 fisWith s f is = case filter (not . null) is of
96 js -> catMaybes $ map items2fis $ filter' $ runLCMmatrix js f
101 Just fun -> filter fun
103 -- Here the sole purpose to take the keys as a Set is tell we do not want
105 fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
106 fisWithSizePoly n f ks = map (fmap fromItem) . fisWithSize n f . map (map toItem)
108 ksv = V.fromList $ Set.toList ks
109 ksm = Map.fromList . flip zip [0..] $ V.toList ksv
113 fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
114 fisWithSizePoly2 n f is = fisWithSizePoly n f ks is
116 ks = Set.fromList $ concat is
118 fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
119 fisWithSizePolyMap n f is =
120 Map.fromList $ (\i -> (Set.fromList (_fisItemSet i), _fisCount i)) <$> fisWithSizePoly2 n f is
123 ------------------------------------------------------------------------
124 ------------------------------------------------------------------------
126 ---- Weighted [[Item]]
128 isSublistOf :: Ord a => [a] -> [a] -> Bool
129 isSublistOf sub lst = all (\i -> elem i lst) sub
131 reIndexFis :: Ord a => [([a],(b,c))] -> [Fis' a] -> [(Fis' a,([b],[c]))]
132 reIndexFis items fis = map (\f ->
133 let docs = filter (\(lst,_) -> isSublistOf (_fisItemSet f) lst) items
134 in (f, (map (fst . snd) docs,map (snd . snd) docs))) fis
136 wsum :: [Maybe Double] -> Maybe Double
137 wsum lst = fmap sum $ sequence lst
139 fisWithSizePolyMap' :: Ord a => Size -> Frequency -> [([a], (Maybe Double,[Int]))] -> Map (Set a) (Int, (Maybe Double,[Int]))
140 fisWithSizePolyMap' n f is = Map.fromList
141 $ map (\(fis,(ws,sources)) -> (Set.fromList (_fisItemSet fis),(_fisCount fis,(wsum ws,concat sources))))
143 $ fisWithSizePoly2 n f (map fst is)
145 ------------------------------------------------------------------------
146 ------------------------------------------------------------------------
150 ---- | /!\ indexes are not the same:
152 ---- | Index ngrams from Map
153 ----indexNgram :: Ord a => Map a Occ -> Map Index a
154 ----indexNgram m = fromList (zip [1..] (keys m))
156 ---- | Index ngrams from Map
157 ----ngramIndex :: Ord a => Map a Occ -> Map a Index
158 ----ngramIndex m = fromList (zip (keys m) [1..])
160 --indexWith :: Ord a => Map a Occ -> [a] -> [Int]
161 --indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
163 --indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
164 --indexIt xs = (m, is)
166 -- m = sumOcc (map occ xs)
167 -- is = map (indexWith m) xs
169 --list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
170 --list2fis n xs = (m', fs)
172 -- (m, is) = indexIt xs
173 -- m' = M.filter (>50000) m
176 --text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
177 --text2fis n xs = list2fis n (map terms xs)
179 ----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
180 ----text2fisWith = undefined