src/Gargantext/Text/Metrics.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Mainly reexport functions in @Data.Text.Metrics@
  11
  12
  13 TODO
  14 noApax :: Ord a => Map a Occ -> Map a Occ
  15 noApax m = M.filter (>1) m
  16
  17 -}
  18
  19 {-# LANGUAGE BangPatterns      #-}
  20 {-# LANGUAGE NoImplicitPrelude #-}
  21 {-# LANGUAGE OverloadedStrings #-}
  22
  23 module Gargantext.Text.Metrics
  24   where
  25
  26 import Data.Text (Text, pack)
  27 import Data.Ord (comparing, Down(..))
  28 import Data.Map (Map)
  29 import qualified Data.List as L
  30 import qualified Data.Map  as M
  31 import qualified Data.Set  as S
  32 import qualified Data.Text as T
  33 import qualified Data.Vector as V
  34 import qualified Data.Vector.Unboxed as VU
  35 import Data.Tuple.Extra (both)
  36 --import GHC.Real (Ratio)
  37 --import qualified Data.Text.Metrics as DTM
  38 import Data.Array.Accelerate (toList)
  39 import Math.KMeans (kmeans, euclidSq, elements)
  40
  41
  42 import Gargantext.Prelude
  43
  44 import Gargantext.Text.Metrics.Count (occurrences, cooc)
  45 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
  46 import Gargantext.Core (Lang(EN))
  47 import Gargantext.Core.Types (Terms(..))
  48 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
  49
  50 import Gargantext.Viz.Graph.Distances.Matrice
  51 import Gargantext.Viz.Graph.Index
  52
  53 import qualified Data.Array.Accelerate.Interpreter as DAA
  54 import qualified Data.Array.Accelerate as DAA
  55 -- import Data.Array.Accelerate ((:.)(..), Z(..))
  56
  57 import GHC.Real (round)
  58
  59 import Debug.Trace
  60 import Prelude (seq)
  61
  62 data MapListSize   = MapListSize   Int
  63 data InclusionSize = InclusionSize Int
  64 data SampleBins    = SampleBins    Double
  65 data Clusters      = Clusters      Int
  66 data DefaultValue  = DefaultValue  Int
  67
  68 data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
  69                                  , fc_inclusionSize :: InclusionSize
  70                                  , fc_sampleBins    :: SampleBins
  71                                  , fc_clusters      :: Clusters
  72                                  , fc_defaultValue  :: DefaultValue
  73                              }
  74
  75 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
  76 filterCooc fc cc = (filterCooc' fc) ts cc
  77   where
  78     ts     = map _scored_terms $ takeSome fc $ coocScored cc
  79
  80
  81 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
  82 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
  83   foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
  84     M.empty selection
  85   where
  86     selection  = [(x,y) | x <- ts
  87                         , y <- ts
  88                        -- , x >= y
  89                         ]
  90
  91
  92 -- | Map list creation
  93 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
  94 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
  95 -- each parts is then ordered by Inclusion/Exclusion
  96 -- take n scored terms in each parts where n * SampleBins = MapListSize.
  97 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
  98 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
  99                     $ takeSample n m
 100                     $ L.take l' $ sortWith (Down . _scored_incExc) scores
 101                     -- $ splitKmeans k scores
 102   where
 103     -- TODO: benchmark with accelerate-example kmeans version
 104     splitKmeans x xs = L.concat $ map elements
 105                      $ V.take (k-1)
 106                      $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
 107                               euclidSq x xs
 108     n = round ((fromIntegral l)/s)
 109     m = round $ (fromIntegral $ length scores) / (s)
 110     takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
 111                         L.concat $ map (L.take n)
 112                                  $ map (sortWith (Down . _scored_incExc))
 113                                  -- TODO use kmeans s instead of splitEvery
 114                                  -- in order to split in s heteregenous parts
 115                                  -- without homogeneous order hypothesis
 116                                  $ splitEvery m
 117                                  $ sortWith (Down . _scored_speGen) xs
 118
 119
 120 data Scored t = Scored { _scored_terms  :: !t
 121                        , _scored_incExc :: !InclusionExclusion
 122                        , _scored_speGen :: !SpecificityGenericity
 123                      } deriving (Show)
 124
 125 -- TODO in the textflow we end up needing these indices, it might be better
 126 -- to compute them earlier and pass them around.
 127 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
 128 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
 129   where
 130     (ti,fi) = createIndices m
 131     (is, ss) = incExcSpeGen $ cooc2mat ti m
 132     scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 152 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
 153   where
 154     (ti,fi) = createIndices m
 155     ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
 156
 157
 158
 159 metrics_text :: Text
 160 metrics_text = T.intercalate " " metrics_sentences
 161
 162 metrics_sentences' :: [Text]
 163 metrics_sentences' = splitBy (Sentences 0) metrics_text
 164
 165 -- | Sentences
 166 metrics_sentences :: [Text]
 167 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
 168                     , "I can see the glass on the table."
 169                     , "There was only a spoon on that table."
 170                     , "The glass just fall from the table, pouring wine everywhere."
 171                     , "I wish the glass did not contain wine."
 172                     ]
 173
 174 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 175
 176 -- | Terms reordered to visually check occurrences
 177 -- >>>
 178 {- [ [["table"],["glass"],["wine"],["spoon"]]
 179    , [["glass"],["table"]]
 180    , [["spoon"],["table"]]
 181    , [["glass"],["table"],["wine"]]
 182    , [["glass"],["wine"]]
 183    ]
 184 -}
 185
 186 metrics_terms :: IO [[Terms]]
 187 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
 188
 189 -- | Occurrences
 190 {-
 191 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
 192          , (fromList ["object"],fromList [(["object"], 3 )])
 193          , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
 194          , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
 195 -}
 196 metrics_occ = occurrences <$> L.concat <$> metrics_terms
 197
 198 {-
 199 -- fromList [((["glas"],["object"]),6)
 200             ,((["glas"],["spoon"]),4)
 201             ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
 202
 203 -}
 204 metrics_cooc = cooc <$> metrics_terms
 205
 206 metrics_cooc_mat = do
 207   m <- metrics_cooc
 208   let (ti,_) = createIndices m
 209   let mat_cooc = cooc2mat ti m
 210   pure ( ti
 211        , mat_cooc
 212        , incExcSpeGen_proba  mat_cooc
 213        , incExcSpeGen        mat_cooc
 214        )
 215
 216 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
 217