2 Module : Gargantext.Core.Text.Metrics.FrequentItemSet
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Domain Specific Language to manage Frequent Item Set (FIS)
15 module Gargantext.Core.Text.Metrics.FrequentItemSet
28 import Data.List (concat, null)
29 import Data.Map.Strict (Map)
30 import Data.Maybe (catMaybes)
32 import Gargantext.Prelude
34 import Prelude (Functor(..)) -- TODO
35 import qualified Data.Map.Strict as Map
36 import qualified Data.Set as Set
37 import qualified Data.Vector as V
-- | Size constraint used to select frequent item sets.
data Size
  = Point Int        -- ^ one exact size
  | Segment Int Int  -- ^ a range of sizes, both bounds included
41 ------------------------------------------------------------------------
-- | Occurrences, i.e. frequent item sets of size 1.
--
-- Delegates to 'fisWithSize' with @'Point' 1@.
occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm = fisWithSize singleItem
  where
    singleItem = Point 1
-- | Cooccurrences, i.e. frequent item sets of size 2.
--
-- Delegates to 'fisWithSize' with @'Point' 2@; the frequency and the item
-- lists are forwarded as given.
cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm freq itemLists = fisWithSize (Point 2) freq itemLists
-- | Frequent item sets as returned by 'fisWith' when no predicate is
-- supplied (its first argument is 'Nothing').
allFis :: Frequency -> [[Item]] -> [Fis]
allFis freq itemLists = fisWith Nothing freq itemLists
53 ------------------------------------------------------------------------
-- | Frequent item sets selected by a pair of size bounds.
--
-- The pair @(lo, hi)@ is turned into @'Segment' lo hi@ and handed to
-- 'fisWithSize'.
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (lo, hi) = fisWithSize range
  where
    range = Segment lo hi
57 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
58 --maximum m = between (0,m)
61 ------------------------------------------------------------------------
62 ------------------------------------------------------------------------
-- | Data type representing a Frequent Item Set.
-- TODO: replace the list with a Set in '_fisItemSet'.
-- Be careful: doing so risks erasing the HLCM behavior.
67 data Fis' a = Fis' { _fisCount :: Int
-- | Mapping transforms every item of the set; the count is kept as is.
instance Functor Fis' where
  fmap g fis = case fis of
    Fis' count members -> Fis' count (fmap g members)
-- | Sugar from a raw item list to a FIS.
--
-- The head of the list becomes the count and the tail the item set.
-- An empty list has no head to use as a count, hence 'Nothing'.
items2fis :: [Item] -> Maybe Fis
items2fis raw = case raw of
  count : members -> Just (Fis' count members)
  []              -> Nothing
79 ------------------------------------------------------------------------
80 ------------------------------------------------------------------------
-- | Frequent item sets whose number of items satisfies the given 'Size'.
--
-- The predicate built here is applied by 'fisWith' to the raw lists, before
-- 'items2fis' turns them into FIS.  'items2fis' uses the head of a raw list
-- as the count, so the number of items is the list length minus one.  Both
-- constructors are measured that way:
--
--   * @'Point' k@ keeps the sets holding exactly @k@ items;
--   * @'Segment' lo hi@ keeps the sets holding between @lo@ and @hi@ items,
--     bounds included.
--
-- Previously the 'Segment' case compared the bounds against the raw list
-- length (count included), which made it off by one with respect to 'Point'.
--
-- The frequency and the item lists are forwarded to 'fisWith' unchanged.
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = fisWith (Just keep) f is
  where
    -- The leading element of a raw list is the count, not an item.
    itemCount :: [Item] -> Int
    itemCount raw = length raw - 1

    keep :: [Item] -> Bool
    keep raw = case n of
      Point k       -> itemCount raw == k
      Segment lo hi -> lo <= itemCount raw && itemCount raw <= hi
90 --- Filter on Fis and not on [Item]
91 fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
92 fisWith s f is = case filter (not . null) is of
94 js -> catMaybes $ map items2fis $ filter' $ runLCMmatrix js f
99 Just fun -> filter fun
101 -- Here the sole purpose to take the keys as a Set is tell we do not want
103 fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
104 fisWithSizePoly n f ks = map (fmap fromItem) . fisWithSize n f . map (map toItem)
106 ksv = V.fromList $ Set.toList ks
107 ksm = Map.fromList . flip zip [0..] $ V.toList ksv
-- | Variant of 'fisWithSizePoly' that computes the key set itself: the keys
-- are all the elements occurring in the given lists.
fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
fisWithSizePoly2 n f is = fisWithSizePoly n f (Set.fromList (concat is)) is
-- | Run 'fisWithSizePoly2' and index the result as a map from each item set
-- (converted to a 'Set') to its count.
fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
fisWithSizePolyMap n f is =
  Map.fromList
    [ (Set.fromList (_fisItemSet fis), _fisCount fis)
    | fis <- fisWithSizePoly2 n f is
    ]
121 ------------------------------------------------------------------------
122 ------------------------------------------------------------------------
126 ---- | /!\ indexes are not the same:
128 ---- | Index ngrams from Map
129 ----indexNgram :: Ord a => Map a Occ -> Map Index a
130 ----indexNgram m = fromList (zip [1..] (keys m))
132 ---- | Index ngrams from Map
133 ----ngramIndex :: Ord a => Map a Occ -> Map a Index
134 ----ngramIndex m = fromList (zip (keys m) [1..])
136 --indexWith :: Ord a => Map a Occ -> [a] -> [Int]
137 --indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
139 --indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
140 --indexIt xs = (m, is)
142 -- m = sumOcc (map occ xs)
143 -- is = map (indexWith m) xs
145 --list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
146 --list2fis n xs = (m', fs)
148 -- (m, is) = indexIt xs
149 -- m' = M.filter (>50000) m
152 --text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
153 --text2fis n xs = list2fis n (map terms xs)
155 ----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
156 ----text2fisWith = undefined