src/Gargantext/Text/Metrics.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Mainly reexport functions in @Data.Text.Metrics@
  11
  12
  13 TODO
  14 noApax :: Ord a => Map a Occ -> Map a Occ
  15 noApax m = M.filter (>1) m
  16
  17 -}
  18
  19 {-# LANGUAGE BangPatterns      #-}
  20 {-# LANGUAGE NoImplicitPrelude #-}
  21 {-# LANGUAGE OverloadedStrings #-}
  22
  23 module Gargantext.Text.Metrics
  24   where
  25
  26 import Data.Text (Text, pack)
  27 import Data.Map (Map)
  28 import qualified Data.List as L
  29 import qualified Data.Map  as M
  30 import qualified Data.Set  as S
  31 import qualified Data.Text as T
  32 import qualified Data.Vector as V
  33 import qualified Data.Vector.Unboxed as VU
  34 import Data.Tuple.Extra (both)
  35 --import GHC.Real (Ratio)
  36 --import qualified Data.Text.Metrics as DTM
  37 import Data.Array.Accelerate (toList)
  38 import Math.KMeans (kmeans, euclidSq, elements)
  39
  40
  41 import Gargantext.Prelude
  42
  43 import Gargantext.Text.Metrics.Count (occurrences, cooc)
  44 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
  45 import Gargantext.Core (Lang(EN))
  46 import Gargantext.Core.Types (Terms(..))
  47 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
  48
  49 import Gargantext.Viz.Graph.Distances.Matrice
  50 import Gargantext.Viz.Graph.Index
  51
  52 import qualified Data.Array.Accelerate.Interpreter as DAA
  53 import qualified Data.Array.Accelerate as DAA
  54 -- import Data.Array.Accelerate ((:.)(..), Z(..))
  55
  56 import GHC.Real (round)
  57
  58 import Debug.Trace
  59 import Prelude (seq)
  60
  61 filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
  62 filterCooc cc = filterCooc' ts cc
  63   where
  64     ts     = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
  65
  66 filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
  67 filterCooc' ts m = -- trace ("coocScored " <> show (length ts)) $
  68   foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m')
  69     M.empty selection
  70   where
  71     errMessage = panic "Filter cooc: no key"
  72     selection  = [(x,y) | x <- ts, y <- ts, x > y]
  73
  74
  75 type MapListSize = Int
  76 type SampleBins  = Double
  77 type Clusters    = Int
  78
  79 -- | Map list creation
  80 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
  81 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
  82 -- each parts is then ordered by Inclusion/Exclusion
  83 -- take n scored terms in each parts where n * SampleBins = MapListSize.
  84 takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
  85 takeSome l s k scores = L.take l
  86                     $ takeSample n m
  87                     $ splitKmeans k scores
  88   where
  89     -- TODO: benchmark with accelerate-example kmeans version
  90     splitKmeans x xs = elements
  91                      $ V.head
  92                      $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
  93                               euclidSq x xs
  94     n = round ((fromIntegral l)/s)
  95     m = round $ (fromIntegral $ length scores) / (s)
  96     takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
  97                         L.concat $ map (L.take n)
  98                                  $ L.reverse $ map (L.sortOn _scored_incExc)
  99                                  -- TODO use kmeans s instead of splitEvery
 100                                  -- in order to split in s heteregenous parts
 101                                  -- without homogeneous order hypothesis
 102                                  $ splitEvery m
 103                                  $ L.reverse $ L.sortOn _scored_speGen xs
 104
 105
 106 data Scored t = Scored { _scored_terms  :: !t
 107                        , _scored_incExc :: !InclusionExclusion
 108                        , _scored_speGen :: !SpecificityGenericity
 109                      } deriving (Show)
 110
 111 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
 112 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
 113   where
 114     (ti,fi) = createIndices m
 115     (is, ss) = incExcSpeGen $ cooc2mat ti m
 116     scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 138 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
 139   where
 140     (ti,fi) = createIndices m
 141     ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
 142
 143
 144
 145 metrics_text :: Text
 146 metrics_text = T.intercalate " " metrics_sentences
 147
 148 metrics_sentences' :: [Text]
 149 metrics_sentences' = splitBy (Sentences 0) metrics_text
 150
 151 -- | Sentences
 152 metrics_sentences :: [Text]
 153 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
 154                     , "I can see the glass on the table."
 155                     , "There was only a spoon on that table."
 156                     , "The glass just fall from the table, pouring wine everywhere."
 157                     , "I wish the glass did not contain wine."
 158                     ]
 159
 160 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 161
 162 -- | Terms reordered to visually check occurrences
 163 -- >>>
 164 {- [ [["table"],["glass"],["wine"],["spoon"]]
 165    , [["glass"],["table"]]
 166    , [["spoon"],["table"]]
 167    , [["glass"],["table"],["wine"]]
 168    , [["glass"],["wine"]]
 169    ]
 170 -}
 171
 172 metrics_terms :: IO [[Terms]]
 173 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
 174
 175 -- | Occurrences
 176 {-
 177 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
 178          , (fromList ["object"],fromList [(["object"], 3 )])
 179          , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
 180          , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
 181 -}
 182 metrics_occ = occurrences <$> L.concat <$> metrics_terms
 183
 184 {-
 185 -- fromList [((["glas"],["object"]),6)
 186             ,((["glas"],["spoon"]),4)
 187             ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
 188
 189 -}
 190 metrics_cooc = cooc <$> metrics_terms
 191
 192 metrics_cooc_mat = do
 193   m <- metrics_cooc
 194   let (ti,_) = createIndices m
 195   let mat_cooc = cooc2mat ti m
 196   pure ( ti
 197        , mat_cooc
 198        , incExcSpeGen_proba  mat_cooc
 199        , incExcSpeGen        mat_cooc
 200        )
 201
 202 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
 203