{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE BangPatterns #-}

module Gargantext.Core.Text.List
  where

import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap

{-
-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
-}


-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( HasNodeStory env err m
                    , CmdM env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> Maybe FlowSocialListWith
                 -> GroupParams
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
  ngTerms     <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
                      [ (Authors   , MapListSize 9, MaxListSize 1000)
                      , (Sources   , MapListSize 9, MaxListSize 1000)
                      , (Institutes, MapListSize 9, MaxListSize 1000)
                      ]

  pure $ Map.unions $ [ngTerms] <> othersTerms
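-- A rough sketch of the result shape (illustration only, not actual output):
-- the returned Map has one entry per NgramsType built above, e.g.
-- Map.fromList [ (NgramsTerms, [..NgramsElement..]), (Authors, [..])
--              , (Sources, [..]), (Institutes, [..]) ]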


data MapListSize = MapListSize { unMapListSize :: !Int }
data MaxListSize = MaxListSize { unMaxListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM env err m
                         , HasNodeStory env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> Maybe FlowSocialListWith
                      -> GroupParams
                      -> (NgramsType, MapListSize, MaxListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt

  -- PrivateFirst for now, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms

    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

    listSize = mapListSize - (List.length mapTerms)
    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.take maxListSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'
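    -- Illustration of the split above (hypothetical numbers, not from real
    -- data): with MapListSize 9 and 3 terms already flagged MapTerm by the
    -- social lists, listSize = 9 - 3 = 6, so the 6 highest-scored remaining
    -- terms are promoted to MapTerm and the following ones (capped by
    -- MaxListSize) become CandidateTerm.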


  pure $ Map.fromList [( nt, List.take maxListSize $ (toNgramsElement stopTerms)
                                                  <> (toNgramsElement mapTerms )
                                                  <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
                                                  <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                       )]


getGroupParams :: ( HasNodeError err
                  , CmdM env err m
                  , HasNodeStory env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  !hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (\x -> x <> hashMap) gp
getGroupParams gp _ = pure gp


-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM env err m
                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> Maybe FlowSocialListWith
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize) = do

  -- Filter 0 With Double
  -- Computing global speGen score
  printDebug "[buildNgramsTermsList: Sample List] / start" nt
  !(allTerms :: HashMap NgramsTerm Double) <- getTficf_withSample uCid mCid nt

  printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)

  printDebug "[buildNgramsTermsList: Flow Social List / start]" nt

  -- PrivateFirst for now, since the Public NodeMode is not implemented yet
  !(socialLists :: FlowCont NgramsTerm FlowListScores)
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
  printDebug "[buildNgramsTermsList: Flow Social List / end]" nt

  let !ngramsKeys = HashSet.fromList $ List.take 1000 $ HashSet.toList $ HashMap.keysSet allTerms

  printDebug "[buildNgramsTermsList: ngramsKeys]" (HashSet.size ngramsKeys)

  !groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)

  printDebug "[buildNgramsTermsList: groupParams']" (""::Text)

  let
    !socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
    --printDebug "socialLists_Stemmed" socialLists_Stemmed
    !groupedWithList = toGroupedTree socialLists_Stemmed allTerms
    !(stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                 $ HashMap.filter (\g -> (view gts'_score g) > 1)
                                 $ view flc_scores groupedWithList

    !(groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms

  printDebug "[buildNgramsTermsList] stopTerms" stopTerms

  -- splitting mono-terms and multi-terms to take a proportional number of candidates
  -- use a % of the list if too big, or an Int if too small
  let
    !listSizeGlobal = 2000 :: Double
    !monoSize = 0.4 :: Double
    !multSize = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns

    !(groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
    !(groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
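    -- Worked example of the proportional split (a sketch; actual counts depend
    -- on how many candidate terms survive the earlier filters): with
    -- listSizeGlobal = 2000, the mono-term head keeps round (0.4 * 2000) = 800
    -- entries and the multi-term head keeps round (0.6 * 2000) = 1200 entries;
    -- groupedMultTail is kept aside and is tagged CandidateTerm at the end.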

-------------------------
-- Filter 1 With Set NodeId and SpeGen
    !selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)

  printDebug "[buildNgramsTermsList: selectedTerms]" selectedTerms

  -- TODO remove (and remove HasNodeError instance)
  !userListId   <- defaultList uCid
  !masterListId <- defaultList mCid

  !mapTextDocIds <- getContextsByNgramsOnlyUser uCid
                                                [userListId, masterListId]
                                                nt
                                                selectedTerms


  printDebug "[buildNgramsTermsList: mapTextDocIds]" mapTextDocIds

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    !groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
                                 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)


  printDebug "[buildNgramsTermsList: groupedTreeScores_SetNodeId]" groupedTreeScores_SetNodeId

  -- Cooccurrences computation
  --, t1 >= t2 -- permute byAxis diag -- since the matrix is symmetric
  let !mapCooc = HashMap.filter (>1) -- removing cooc of 1
               $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                                  | (t1, s1) <- mapStemNodeIds
                                  , (t2, s2) <- mapStemNodeIds
                                  ]
        where
          mapStemNodeIds = HashMap.toList
                         $ HashMap.map viewScores
                         $ groupedTreeScores_SetNodeId
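  -- Sketch of what mapCooc holds (illustrative values only): if t1 occurs in
  -- documents {1,2,3} and t2 in {2,3,5}, the pair (t1, t2) gets the score
  -- Set.size (Set.intersection s1 s2) = 2; pairs sharing at most one document
  -- are dropped by the HashMap.filter (>1).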
  let
    -- computing scores
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc
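    -- Reading the pipeline bottom-up (descriptive note): scored' turns the
    -- cooccurrence map into Scored values carrying genInc / speExc per term,
    -- normalizeLocal and normalizeGlobal rescale them, and the final map
    -- re-indexes the result by term, so that mapScores identity (used below)
    -- yields a HashMap NgramsTerm (Scored NgramsTerm).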

  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    !groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId

  let
    -- sort / partition / split
    -- filter mono/multi again
    !(monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

    -- filter with max score
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    !(monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    !(multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
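    -- Descriptive note: a term whose genericity/inclusion score (scored_genInc)
    -- is strictly greater than its specificity/exclusion score (scored_speExc)
    -- lands in the *Incl bucket, otherwise in the *Excl one; this is done
    -- separately for mono-terms and multi-terms.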

  -- splitAt
  let
    -- use a % of the list if too big, or an Int if too small
    !mapSize = 1000 :: Double
    !canSize = mapSize * 2 :: Double

    !inclSize = 0.4 :: Double
    !exclSize = 1 - inclSize

    splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
    sortOn  f        = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList

    monoInc_size n = splitAt' n $ monoSize * inclSize / 2
    multExc_size n = splitAt' n $ multSize * exclSize / 2
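    -- Worked example with the constants above (a sketch; real counts depend on
    -- the corpus): monoInc_size mapSize splits after round (1000 * 0.4 * 0.4 / 2)
    -- = 80 terms, multExc_size mapSize after round (1000 * 0.6 * 0.6 / 2) = 180
    -- terms; with canSize = 2000 the candidate heads keep 160 and 360 terms.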


    !(mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
    !(mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl

    !(mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
    !(mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl


    !(canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
    !(canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail

    !(canMulScoredInclHead,  _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
    !(canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail

------------------------------------------------------------
-- Final Step: building the Typed list
-- Candidate terms need to be filtered
  let
    !maps = setListType (Just MapTerm)
          $  mapMonoScoredInclHead
          <> mapMonoScoredExclHead
          <> mapMultScoredInclHead
          <> mapMultScoredExclHead

    -- An original way to filter to start with
    !cands = setListType (Just CandidateTerm)
           $  canMonoScoredIncHead
           <> canMonoScoredExclHead
           <> canMulScoredInclHead
           <> canMultScoredExclHead

    -- TODO count it too
    !cands' = setListType (Just CandidateTerm)
            {-\$ groupedMonoTail
             <>-} groupedMultTail

    -- Quick FIX
    !candNgramsElement = List.take 1000
                       $ toNgramsElement cands <> toNgramsElement cands'

    !result = Map.unionsWith (<>)
       [ Map.fromList [( nt, toNgramsElement maps
                          <> toNgramsElement stopTerms
                          <> candNgramsElement
                       )]
       ]
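    -- Descriptive note on the final shape: result is a single-entry Map from
    -- the requested NgramsType to the map terms, the stop terms and at most
    -- 1000 candidate terms, each NgramsElement already carrying its ListType.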

  pure result