module Gargantext.Text.Metrics
where
-import Data.Ord (Down(..))
-import qualified Data.List as L
-
-import Data.Map (Map)
-import qualified Data.Map as M
-
+--import Data.Array.Accelerate ((:.)(..), Z(..))
--import Math.KMeans (kmeans, euclidSq, elements)
+--import GHC.Float (exp)
+import Data.Tuple.Extra (both)
+import Data.Map (Map)
+import Data.List.Extra (sortOn)
+import GHC.Real (round)
import Gargantext.Prelude
import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index
-
-import qualified Data.Array.Accelerate.Interpreter as DAA
+import Gargantext.Core.Statistics (pcaReduceTo, Dimension(..))
import qualified Data.Array.Accelerate as DAA
--- import Data.Array.Accelerate ((:.)(..), Z(..))
-
-import GHC.Real (round)
+import qualified Data.Array.Accelerate.Interpreter as DAA
+import qualified Data.List as List
+import qualified Data.Map as Map
-import Debug.Trace (trace)
+import qualified Data.Vector.Storable as Vec
-data MapListSize = MapListSize Int
-data InclusionSize = InclusionSize Int
-data SampleBins = SampleBins Double
-data Clusters = Clusters Int
-data DefaultValue = DefaultValue Int
+type GraphListSize = Int
+type InclusionSize = Int
-data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
- , fc_inclusionSize :: InclusionSize
- , fc_sampleBins :: SampleBins
- , fc_clusters :: Clusters
- , fc_defaultValue :: DefaultValue
- }
+{-
+toScored' :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t]
+toScored' = map2scored
+ . (pcaReduceTo (Dimension 2))
+ . (Map.filter (\v -> Vec.length v > 1))
+ . (Map.unionsWith (<>))
+-}
-filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
-filterCooc fc cc = (filterCooc' fc) ts cc
+scored :: Ord t => Map (t,t) Int -> [Scored t]
+scored = map2scored . (pcaReduceTo (Dimension 2)) . scored2map
+ where
+ scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+ scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
+
+ map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
+ map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
+
+-- TODO change type with (x,y)
+data Scored ts = Scored
+ { _scored_terms :: !ts
+ , _scored_incExc :: !InclusionExclusion
+ , _scored_speGen :: !SpecificityGenericity
+ } deriving (Show)
+
+localMetrics' :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+localMetrics' m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe]))
+ (Map.toList fi)
+ scores
where
- ts = map _scored_terms $ takeSome fc $ coocScored cc
+ (ti, fi) = createIndices m
+ (is, ss) = incExcSpeGen $ cooc2mat ti m
+ scores = DAA.toList
+ $ DAA.run
+ $ DAA.zip (DAA.use is) (DAA.use ss)
+
-filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
-filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = trace ("coocScored " <> show ts) $
- foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
- M.empty selection
+-- TODO Code to be remove below
+-- TODO in the textflow we end up needing these indices , it might be
+-- better to compute them earlier and pass them around.
+scored' :: Ord t => Map (t,t) Int -> [Scored t]
+scored' m = zipWith (\(_,t) (inc,spe) -> Scored t (inc) (spe)) (Map.toList fi) scores
where
- selection = [(x,y) | x <- ts
- , y <- ts
- , x > y
- ]
-
-
--- | Map list creation
--- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
--- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
--- each parts is then ordered by Inclusion/Exclusion
--- take n scored terms in each parts where n * SampleBins = MapListSize.
-takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
-takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
- $ takeSample n m
- $ L.take l' $ reverse $ sortWith (Down . _scored_incExc) scores
- -- splitKmeans k scores
+ (ti, fi) = createIndices m
+ (is, ss) = incExcSpeGen $ cooc2mat ti m
+ scores = DAA.toList
+ $ DAA.run
+ $ DAA.zip (DAA.use is) (DAA.use ss)
+
+
+takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> ([t],[t])
+takeScored listSize incSize = both (map _scored_terms)
+ . linearTakes listSize incSize _scored_speGen
+ _scored_incExc
+ . scored
+
+
+-- | Filter Scored data
+-- >>> linearTakes 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
+-- [(3,8),(6,5)]
+linearTakes :: (Ord b1, Ord b2)
+ => GraphListSize -> InclusionSize
+ -> (a -> b2) -> (a -> b1) -> [a] -> ([a],[a])
+linearTakes mls incSize speGen incExc = (List.splitAt mls)
+ . List.concat
+ . map (take $ round
+ $ (fromIntegral mls :: Double)
+ / (fromIntegral incSize :: Double)
+ )
+ . map (sortOn speGen)
+ . splitEvery incSize
+ . take 5000
+ . takePercent (0.70)
+ . sortOn incExc
+
+takePercent :: Double -> [a] -> [a]
+takePercent l xs = List.take l' xs
where
- -- TODO: benchmark with accelerate-example kmeans version
- --splitKmeans x xs = L.concat $ map elements
- -- $ V.take (k-1)
- -- $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
- -- euclidSq x xs
- n = round ((fromIntegral l)/s)
- m = round $ (fromIntegral $ length scores) / (s)
- takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
- L.concat $ map (L.take n')
- $ map (sortWith (Down . _scored_incExc))
- -- TODO use kmeans s instead of splitEvery
- -- in order to split in s heteregenous parts
- -- without homogeneous order hypothesis
- $ splitEvery m'
- $ sortWith (Down . _scored_speGen) xs
-
-
-data Scored ts = Scored { _scored_terms :: !ts
- , _scored_incExc :: !InclusionExclusion
- , _scored_speGen :: !SpecificityGenericity
- } deriving (Show)
-
--- TODO in the textflow we end up needing these indices, it might be better
--- to compute them earlier and pass them around.
-coocScored :: Ord t => Map (t,t) Int -> [Scored t]
-coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
+ l' = round $ l * (fromIntegral $ List.length xs)
+
+splitTake :: (Int, a -> Bool) -> (Int, a -> Bool) -> [a] -> ([a], [a])
+splitTake (a, af) (b, bf) xs = (mpa <> mpb, ca <> cb)
where
- (ti,fi) = createIndices m
- (is, ss) = incExcSpeGen $ cooc2mat ti m
- scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
+ (mpa, ca) = List.splitAt a $ List.filter af xs
+ (mpb, cb) = List.splitAt b $ List.filter bf xs
+