working on level links

[gargantext.git] / src / Gargantext / Text / Metrics.hs
diff --git a/src/Gargantext/Text/Metrics.hs b/src/Gargantext/Text/Metrics.hs

index 4d1a5462719e814c00e73552a9cf757c7dbf4e2e..304026da4411811701cb9d2a973371fef3e2a3e4 100644 (file)
--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -9,207 +9,129 @@ Portability : POSIX
  
  Mainly reexport functions in @Data.Text.Metrics@
  
-
-TODO
-noApax :: Ord a => Map a Occ -> Map a Occ
-noApax m = M.filter (>1) m
-
  -}
  
  {-# LANGUAGE BangPatterns      #-}
  {-# LANGUAGE NoImplicitPrelude #-}
  {-# LANGUAGE OverloadedStrings #-}
  
-module Gargantext.Text.Metrics 
+module Gargantext.Text.Metrics
    where
  
-import Data.Text (Text, pack)
-import Data.Ord (comparing, Down(..))
-import Data.Map (Map)
-import qualified Data.List as L
-import qualified Data.Map  as M
-import qualified Data.Set  as S
-import qualified Data.Text as T
-import qualified Data.Vector as V
-import qualified Data.Vector.Unboxed as VU
-import Data.Tuple.Extra (both)
---import GHC.Real (Ratio)
---import qualified Data.Text.Metrics as DTM
-import Data.Array.Accelerate (toList)
-import Math.KMeans (kmeans, euclidSq, elements)
+--import Data.Array.Accelerate ((:.)(..), Z(..))
+--import Math.KMeans (kmeans, euclidSq, elements)
  
+--import GHC.Float (exp)
  
+import Data.Map (Map)
+import Data.List.Extra (sortOn)
+import GHC.Real (round)
  import Gargantext.Prelude
-
-import Gargantext.Text.Metrics.Count (occurrences, cooc)
-import Gargantext.Text.Terms (TermType(MonoMulti), terms)
-import Gargantext.Core (Lang(EN))
-import Gargantext.Core.Types (Terms(..))
-import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
-
  import Gargantext.Viz.Graph.Distances.Matrice
  import Gargantext.Viz.Graph.Index
-
-import qualified Data.Array.Accelerate.Interpreter as DAA
  import qualified Data.Array.Accelerate as DAA
--- import Data.Array.Accelerate ((:.)(..), Z(..))
-
-import GHC.Real (round)
-
-import Debug.Trace
-import Prelude (seq)
-
-data MapListSize   = MapListSize   Int
-data InclusionSize = InclusionSize Int
-data SampleBins    = SampleBins    Double
-data Clusters      = Clusters      Int
-data DefaultValue  = DefaultValue  Int
-
-data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
-                                 , fc_inclusionSize :: InclusionSize
-                                 , fc_sampleBins    :: SampleBins
-                                 , fc_clusters      :: Clusters
-                                 , fc_defaultValue  :: DefaultValue
-                             }
-
-filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
-filterCooc fc cc = (filterCooc' fc) ts cc
-  where
-    ts     = map _scored_terms $ takeSome fc $ coocScored cc
-
+import qualified Data.Array.Accelerate.Interpreter as DAA
+import qualified Data.List as List
+import qualified Data.Map  as Map
  
-filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
-filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
-  foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
-    M.empty selection
-  where
-    selection  = [(x,y) | x <- ts
-                        , y <- ts
-                       -- , x >= y
-                        ]
-
-
--- | Map list creation
--- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
--- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
--- each parts is then ordered by Inclusion/Exclusion
--- take n scored terms in each parts where n * SampleBins = MapListSize.
-takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
-takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
-                    $ takeSample n m
-                    $ L.take l' $ sortWith (Down . _scored_incExc) scores
-                    -- $ splitKmeans k scores
-  where
-    -- TODO: benchmark with accelerate-example kmeans version
-    splitKmeans x xs = L.concat $ map elements
-                     $ V.take (k-1)
-                     $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
-                              euclidSq x xs
-    n = round ((fromIntegral l)/s)
-    m = round $ (fromIntegral $ length scores) / (s)
-    takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
-                        L.concat $ map (L.take n)
-                                 $ map (sortWith (Down . _scored_incExc))
-                                 -- TODO use kmeans s instead of splitEvery
-                                 -- in order to split in s heteregenous parts
-                                 -- without homogeneous order hypothesis
-                                 $ splitEvery m
-                                 $ sortWith (Down . _scored_speGen) xs
-
-
-data Scored t = Scored { _scored_terms  :: !t
-                       , _scored_incExc :: !InclusionExclusion
-                       , _scored_speGen :: !SpecificityGenericity
-                     } deriving (Show)
-
-coocScored :: Ord t => Map (t,t) Int -> [Scored t]
-coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
-  where
-    (ti,fi) = createIndices m
-    (is, ss) = incExcSpeGen $ cooc2mat ti m
-    scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
+import Numeric.Statistics.PCA (pcaReduceN)
+import qualified Data.Vector.Storable as Vec
+import Data.Array.IArray (Array, listArray, elems)
  
+type GraphListSize = Int
+type InclusionSize = Int
  
  
  
+toScored :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t] 
+toScored = map2scored
+         . (reduceTo (Dimension 2))
+         . (Map.filter (\v -> Vec.length v > 1))
+         . (Map.unionsWith (<>))
  
  
+scored :: Ord t => Map (t,t) Int -> [Scored t]
+scored = map2scored . (reduceTo (Dimension 2)) . scored2map
  
+scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
  
+map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
+map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
  
+-- TODO change type with (x,y)
+data Scored ts = Scored
+  { _scored_terms  :: !ts
+  , _scored_incExc :: !InclusionExclusion
+  , _scored_speGen :: !SpecificityGenericity
+  } deriving (Show)
  
+data Dimension = Dimension Int
  
+reduceTo :: Ord t
+         => Dimension
+         -> Map t (Vec.Vector Double)
+         -> Map t (Vec.Vector Double)
+reduceTo (Dimension d) ss = Map.fromList $ zip txts $ elems $ pcaReduceN ss'' d
+  where
+    ss'' :: Array Int (Vec.Vector Double)
+    ss'' = listArray (1, List.length ss') ss'
  
+    (txts,ss') = List.unzip $ Map.toList ss
  
  
+localMetrics :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+localMetrics m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe]))
+                                       (Map.toList fi)
+                                       scores
+  where
+    (ti, fi) = createIndices m
+    (is, ss) = incExcSpeGen $ cooc2mat ti m
+    scores   = DAA.toList
+             $ DAA.run
+             $ DAA.zip (DAA.use is) (DAA.use ss)
  
  
  
  
-incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
-incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
+-- TODO Code to be remove below
+-- TODO in the textflow we end up needing these indices , it might be
+-- better to compute them earlier and pass them around.
+scored' :: Ord t => Map (t,t) Int -> [Scored t]
+scored' m = zipWith (\(_,t) (inc,spe) -> Scored t (inc) (spe)) (Map.toList fi) scores
    where
-    (ti,fi) = createIndices m
-    ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
+    (ti, fi) = createIndices m
+    (is, ss) = incExcSpeGen $ cooc2mat ti m
+    scores   = DAA.toList
+             $ DAA.run
+             $ DAA.zip (DAA.use is) (DAA.use ss)
  
  
  
-metrics_text :: Text
-metrics_text = T.intercalate " " metrics_sentences
  
-metrics_sentences' :: [Text]
-metrics_sentences' = splitBy (Sentences 0) metrics_text
  
--- | Sentences 
-metrics_sentences :: [Text]
-metrics_sentences = [ "There is a table with a glass of wine and a spoon."
-                    , "I can see the glass on the table."
-                    , "There was only a spoon on that table."
-                    , "The glass just fall from the table, pouring wine everywhere."
-                    , "I wish the glass did not contain wine."
-                    ]
  
-metrics_sentences_Test = metrics_sentences == metrics_sentences'
+takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
+takeScored listSize incSize = map _scored_terms
+                            . linearTakes listSize incSize _scored_speGen
+                                                           _scored_incExc
+                            . scored
  
--- | Terms reordered to visually check occurrences
--- >>> 
-{- [ [["table"],["glass"],["wine"],["spoon"]]
-   , [["glass"],["table"]]
-   , [["spoon"],["table"]]
-   , [["glass"],["table"],["wine"]]
-   , [["glass"],["wine"]]
-   ]
--}
  
-metrics_terms :: IO [[Terms]]
-metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
+-- | Filter Scored data
+-- >>> linearTakes 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
+-- [(3,8),(6,5)]
+linearTakes :: (Ord b1, Ord b2)
+            => GraphListSize -> InclusionSize
+            -> (a -> b2) -> (a -> b1) -> [a] -> [a]
+linearTakes gls incSize speGen incExc = take gls
+                      . List.concat
+                      . map (take $ round
+                                  $ (fromIntegral gls     :: Double)
+                                  / (fromIntegral incSize :: Double)
+                             )
+                      . map (sortOn incExc)
+                      . splitEvery incSize
+                      . sortOn speGen
  
--- | Occurrences
-{-
-fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
-         , (fromList ["object"],fromList [(["object"], 3 )])
-         , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
-         , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
--}
-metrics_occ = occurrences <$> L.concat <$> metrics_terms
-
-{- 
--- fromList [((["glas"],["object"]),6)
-            ,((["glas"],["spoon"]),4)
-            ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
-
--}
-metrics_cooc = cooc <$> metrics_terms
-
-metrics_cooc_mat = do
-  m <- metrics_cooc
-  let (ti,_) = createIndices m
-  let mat_cooc = cooc2mat ti m
-  pure ( ti
-       , mat_cooc
-       , incExcSpeGen_proba  mat_cooc
-       , incExcSpeGen        mat_cooc
-       )
-
-metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc