[STACK] upgrade.

[gargantext.git] / src / Gargantext / Text / Metrics.hs
diff --git a/src/Gargantext/Text/Metrics.hs b/src/Gargantext/Text/Metrics.hs

index a3fa52cfda161324ada0463da79d8f4b720b3325..5692b837ada29f292c59eff49ca605fe1a3011b4 100644 (file)
--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -18,93 +18,110 @@ Mainly reexport functions in @Data.Text.Metrics@
  module Gargantext.Text.Metrics
    where
  
-import Data.Ord (Down(..))
-import qualified Data.List as L
-
-import Data.Map (Map)
-import qualified Data.Map  as M
-
+--import Data.Array.Accelerate ((:.)(..), Z(..))
  --import Math.KMeans (kmeans, euclidSq, elements)
  
+--import GHC.Float (exp)
+import Data.Tuple.Extra (both)
+import Data.Map (Map)
+import Data.List.Extra (sortOn)
+import GHC.Real (round)
  import Gargantext.Prelude
  import Gargantext.Viz.Graph.Distances.Matrice
  import Gargantext.Viz.Graph.Index
-
-import qualified Data.Array.Accelerate.Interpreter as DAA
+import Gargantext.Core.Statistics (pcaReduceTo, Dimension(..))
  import qualified Data.Array.Accelerate as DAA
--- import Data.Array.Accelerate ((:.)(..), Z(..))
-
-import GHC.Real (round)
+import qualified Data.Array.Accelerate.Interpreter as DAA
+import qualified Data.List as List
+import qualified Data.Map  as Map
  
-import Debug.Trace (trace)
+import qualified Data.Vector.Storable as Vec
  
-data MapListSize   = MapListSize   Int
-data InclusionSize = InclusionSize Int
-data SampleBins    = SampleBins    Double
-data Clusters      = Clusters      Int
-data DefaultValue  = DefaultValue  Int
+type GraphListSize = Int
+type InclusionSize = Int
  
-data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
-                                 , fc_inclusionSize :: InclusionSize
-                                 , fc_sampleBins    :: SampleBins
-                                 , fc_clusters      :: Clusters
-                                 , fc_defaultValue  :: DefaultValue
-                             }
+{-
+toScored' :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t] 
+toScored' = map2scored
+         . (pcaReduceTo (Dimension 2))
+         . (Map.filter (\v -> Vec.length v > 1))
+         . (Map.unionsWith (<>))
+-}
  
-filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
-filterCooc fc cc = (filterCooc' fc) ts cc
+scored :: Ord t => Map (t,t) Int -> [Scored t]
+scored = map2scored . (pcaReduceTo (Dimension 2)) . scored2map
+  where
+    scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+    scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
+
+    map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
+    map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
+
+-- TODO change type with (x,y)
+data Scored ts = Scored
+  { _scored_terms  :: !ts
+  , _scored_incExc :: !InclusionExclusion
+  , _scored_speGen :: !SpecificityGenericity
+  } deriving (Show)
+
+localMetrics' :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+localMetrics' m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe]))
+                                       (Map.toList fi)
+                                       scores
    where
-    ts     = map _scored_terms $ takeSome fc $ coocScored cc
+    (ti, fi) = createIndices m
+    (is, ss) = incExcSpeGen $ cooc2mat ti m
+    scores   = DAA.toList
+             $ DAA.run
+             $ DAA.zip (DAA.use is) (DAA.use ss)
+
  
-filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
-filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = trace ("coocScored " <> show ts) $
-  foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
-    M.empty selection
+-- TODO Code to be remove below
+-- TODO in the textflow we end up needing these indices , it might be
+-- better to compute them earlier and pass them around.
+scored' :: Ord t => Map (t,t) Int -> [Scored t]
+scored' m = zipWith (\(_,t) (inc,spe) -> Scored t (inc) (spe)) (Map.toList fi) scores
    where
-    selection  = [(x,y) | x <- ts
-                        , y <- ts
-                        , x > y
-                        ]
-
-
--- | Map list creation
--- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
--- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
--- each parts is then ordered by Inclusion/Exclusion
--- take n scored terms in each parts where n * SampleBins = MapListSize.
-takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
-takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
-                    $ takeSample n m
-                    $ L.take l' $ reverse $ sortWith (Down . _scored_incExc) scores
-                    -- splitKmeans k scores
+    (ti, fi) = createIndices m
+    (is, ss) = incExcSpeGen $ cooc2mat ti m
+    scores   = DAA.toList
+             $ DAA.run
+             $ DAA.zip (DAA.use is) (DAA.use ss)
+
+
+takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> ([t],[t])
+takeScored listSize incSize = both (map _scored_terms)
+                            . linearTakes listSize incSize _scored_speGen
+                                                           _scored_incExc
+                            . scored
+
+
+-- | Filter Scored data
+-- >>> linearTakes 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
+-- [(3,8),(6,5)]
+linearTakes :: (Ord b1, Ord b2)
+            => GraphListSize -> InclusionSize
+            -> (a -> b2) -> (a -> b1) -> [a] -> ([a],[a])
+linearTakes mls incSize speGen incExc = (List.splitAt mls)
+                      . List.concat
+                      . map (take $ round
+                                  $ (fromIntegral mls     :: Double)
+                                  / (fromIntegral incSize :: Double)
+                             )
+                      . map (sortOn speGen)
+                      . splitEvery incSize
+                      . take 5000
+                      . takePercent (0.70)
+                      . sortOn incExc
+
+takePercent :: Double -> [a] -> [a]
+takePercent l xs = List.take l' xs
    where
-    -- TODO: benchmark with accelerate-example kmeans version
-    --splitKmeans x xs = L.concat $ map elements
-    --                 $ V.take (k-1)
-    --                 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
-    --                          euclidSq x xs
-    n = round ((fromIntegral l)/s)
-    m = round $ (fromIntegral $ length scores) / (s)
-    takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
-                        L.concat $ map (L.take n')
-                                 $ map (sortWith (Down . _scored_incExc))
-                                 -- TODO use kmeans s instead of splitEvery
-                                 -- in order to split in s heteregenous parts
-                                 -- without homogeneous order hypothesis
-                                 $ splitEvery m'
-                                 $ sortWith (Down . _scored_speGen) xs
-
-
-data Scored ts = Scored { _scored_terms :: !ts
-                        , _scored_incExc :: !InclusionExclusion
-                        , _scored_speGen :: !SpecificityGenericity
-                        } deriving (Show)
-
--- TODO in the textflow we end up needing these indices, it might be better
--- to compute them earlier and pass them around.
-coocScored :: Ord t => Map (t,t) Int -> [Scored t]
-coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
+    l' = round $ l * (fromIntegral $ List.length xs)
+
+splitTake :: (Int, a -> Bool) -> (Int, a -> Bool) -> [a] -> ([a], [a])
+splitTake (a, af) (b, bf) xs = (mpa <> mpb, ca <> cb)
    where
-    (ti,fi) = createIndices m
-    (is, ss) = incExcSpeGen $ cooc2mat ti m
-    scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
+    (mpa, ca) = List.splitAt a $ List.filter af xs
+    (mpb, cb) = List.splitAt b $ List.filter bf xs
+