src/Gargantext/Core/Text/List.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams.Lists
   3 Description : Tools to build lists
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 {-# LANGUAGE ScopedTypeVariables #-}
  13 {-# LANGUAGE TemplateHaskell     #-}
  14
  15 module Gargantext.Core.Text.List
  16   where
  17
  18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
  19 import Data.HashMap.Strict (HashMap)
  20 import Data.HashSet (HashSet)
  21 import Data.Map (Map)
  22 import Data.Monoid (mempty)
  23 import Data.Ord (Down(..))
  24 import Data.Set (Set)
  25 import Data.Tuple.Extra (both)
  26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
  27 import Gargantext.Core.Text (size)
  28 import Gargantext.Core.Text.List.Group
  29 import Gargantext.Core.Text.List.Group.Prelude
  30 import Gargantext.Core.Text.List.Group.WithStem
  31 import Gargantext.Core.Text.List.Social
  32 import Gargantext.Core.Text.List.Social.Prelude
  33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
  34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
  35 import Gargantext.Core.Types.Individu (User(..))
  36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
  37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
  38 import Gargantext.Database.Admin.Types.Node (NodeId)
  39 import Gargantext.Database.Prelude (CmdM)
  40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
  41 import Gargantext.Database.Query.Table.Node (defaultList)
  42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
  43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
  44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
  45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
  46 import Gargantext.Prelude
  47 import qualified Data.HashMap.Strict as HashMap
  48 import qualified Data.List    as List
  49 import qualified Data.Map     as Map
  50 import qualified Data.Set     as Set
  51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
  52 import qualified Data.HashSet as HashSet
  53
  54 {-
  55 -- TODO maybe useful for later
  56 isStopTerm :: StopSize -> Text -> Bool
  57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  58   where
  59     isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
  60 -}
  61
  62
  63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
  64 buildNgramsLists :: ( RepoCmdM env err m
  65                     , CmdM     env err m
  66                     , HasTreeError err
  67                     , HasNodeError err
  68                     )
  69                  => GroupParams
  70                  -> User
  71                  -> UserCorpusId
  72                  -> MasterCorpusId
  73                  -> m (Map NgramsType [NgramsElement])
  74 buildNgramsLists gp user uCid mCid = do
  75   ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  76   othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
  77                       [ (Authors   , MapListSize 9)
  78                       , (Sources   , MapListSize 9)
  79                       , (Institutes, MapListSize 9)
  80                       ]
  81
  82   pure $ Map.unions $ [ngTerms] <> othersTerms
  83
  84
  85 data MapListSize = MapListSize { unMapListSize :: !Int }
  86
  87 buildNgramsOthersList ::( HasNodeError err
  88                         , CmdM     env err m
  89                         , RepoCmdM env err m
  90                         , HasTreeError err
  91                         )
  92                         => User
  93                         -> UserCorpusId
  94                         -> GroupParams
  95                         -> (NgramsType, MapListSize)
  96                         -> m (Map NgramsType [NgramsElement])
  97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  98   allTerms  :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
  99
 100   -- PrivateFirst for first developments since Public NodeMode is not implemented yet
 101   socialLists :: FlowCont NgramsTerm FlowListScores
 102     <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
 103                                                       $ HashMap.fromList
 104                                                       $ List.zip (HashMap.keys allTerms)
 105                                                                  (List.cycle [mempty])
 106                                            )
 107   let
 108     groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
 109
 110     (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
 111                            $ view flc_scores groupedWithList
 112
 113     (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm)  . viewListType) tailTerms
 114
 115     listSize = mapListSize - (List.length mapTerms)
 116     (mapTerms', candiTerms) = both HashMap.fromList
 117                             $ List.splitAt listSize
 118                             $ List.sortOn (Down . viewScore . snd)
 119                             $ HashMap.toList tailTerms'
 120
 121
 122   pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
 123                           <> (toNgramsElement mapTerms )
 124                           <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
 125                           <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
 126                           )]
 127
 128
 129 getGroupParams :: ( HasNodeError err
 130                   , CmdM     env err m
 131                   , RepoCmdM env err m
 132                   , HasTreeError err
 133                   )
 134                => GroupParams -> HashSet Ngrams -> m GroupParams
 135 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
 136   hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
 137   -- printDebug "hashMap" hashMap
 138   pure $ over gwl_map (\x -> x <> hashMap) gp
 139 getGroupParams gp _ = pure gp
 140
 141
 142 -- TODO use ListIds
 143 buildNgramsTermsList :: ( HasNodeError err
 144                         , CmdM     env err m
 145                         , RepoCmdM env err m
 146                         , HasTreeError err
 147                         )
 148                         => User
 149                         -> UserCorpusId
 150                         -> MasterCorpusId
 151                         -> GroupParams
 152                         -> (NgramsType, MapListSize)
 153                         -> m (Map NgramsType [NgramsElement])
 154 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
 155
 156 -- Filter 0 With Double
 157 -- Computing global speGen score
 158   allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
 159
 160   -- PrivateFirst for first developments since Public NodeMode is not implemented yet
 161   socialLists :: FlowCont NgramsTerm FlowListScores
 162     <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
 163                                                       $ HashMap.fromList
 164                                                       $ List.zip (HashMap.keys   allTerms)
 165                                                                  (List.cycle     [mempty])
 166                                            )
 167   let ngramsKeys = HashMap.keysSet allTerms
 168
 169   groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
 170
 171   let
 172     socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
 173   --printDebug "socialLists_Stemmed" socialLists_Stemmed
 174     groupedWithList = toGroupedTree socialLists_Stemmed allTerms
 175     (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
 176                                 $ HashMap.filter (\g -> (view gts'_score g) > 1)
 177                                 $ view flc_scores groupedWithList
 178
 179     (groupedMono, groupedMult)  = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
 180
 181   -- printDebug "stopTerms" stopTerms
 182
 183   -- splitting monterms and multiterms to take proportional candidates
 184     -- use % of list if to big, or Int if too small
 185     listSizeGlobal = 2000 :: Double
 186     monoSize = 0.4  :: Double
 187     multSize = 1 - monoSize
 188
 189     splitAt n' ns = both (HashMap.fromListWith (<>))
 190                   $ List.splitAt (round $ n' * listSizeGlobal)
 191                   $ List.sortOn (viewScore . snd)
 192                   $ HashMap.toList ns
 193
 194     (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
 195     (groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
 196
 197 -------------------------
 198 -- Filter 1 With Set NodeId and SpeGen
 199     selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
 200
 201
 202  -- TODO remove (and remove HasNodeError instance)
 203   userListId    <- defaultList uCid
 204   masterListId  <- defaultList mCid
 205
 206   mapTextDocIds <- getNodesByNgramsOnlyUser uCid
 207                                             [userListId, masterListId]
 208                                             nt
 209                                             selectedTerms
 210
 211   let
 212     groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
 213     groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
 214                                 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
 215
 216
 217   printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
 218
 219   -- Coocurrences computation
 220   --, t1 >= t2 -- permute byAxis diag  -- since matrix symmetric
 221   let mapCooc = HashMap.filter (>1) -- removing cooc of 1
 222               $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
 223                            | (t1, s1) <- mapStemNodeIds
 224                            , (t2, s2) <- mapStemNodeIds
 225                            ]
 226           where
 227             mapStemNodeIds = HashMap.toList
 228                            $ HashMap.map viewScores
 229                            $ groupedTreeScores_SetNodeId
 230   let
 231     -- computing scores
 232     mapScores f = HashMap.fromList
 233                 $ map (\g -> (view scored_terms g, f g))
 234                 $ normalizeGlobal
 235                 $ map normalizeLocal
 236                 $ scored'
 237                 $ Map.fromList -- TODO remove this
 238                 $ HashMap.toList mapCooc
 239
 240   let
 241     groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
 242     groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
 243
 244   let
 245     -- sort / partition / split
 246     -- filter mono/multi again
 247     (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
 248
 249       -- filter with max score
 250     partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
 251                                                    > (view scored_speExc $ view gts'_score g)
 252                                               )
 253
 254     (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
 255     (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
 256
 257   -- splitAt
 258   let
 259     -- use % of list if to big, or Int if to small
 260     mapSize = 1000 :: Double
 261     canSize = mapSize * 5 :: Double
 262
 263     inclSize = 0.4  :: Double
 264     exclSize = 1 - inclSize
 265
 266     splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
 267     sortOn   f  = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
 268
 269     monoInc_size n = splitAt' n $ monoSize * inclSize / 2
 270     multExc_size n = splitAt' n $ multSize * exclSize / 2
 271
 272
 273     (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
 274     (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
 275
 276     (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
 277     (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
 278
 279
 280     (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
 281     (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
 282
 283     (canMulScoredInclHead, _)  = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
 284     (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
 285
 286 ------------------------------------------------------------
 287     -- Final Step building the Typed list
 288     -- Candidates Terms need to be filtered
 289   let
 290     maps = setListType (Just MapTerm)
 291         $  mapMonoScoredInclHead
 292         <> mapMonoScoredExclHead
 293         <> mapMultScoredInclHead
 294         <> mapMultScoredExclHead
 295
 296     -- An original way to filter to start with
 297     cands = setListType (Just CandidateTerm)
 298           $ canMonoScoredIncHead
 299           <> canMonoScoredExclHead
 300           <> canMulScoredInclHead
 301           <> canMultScoredExclHead
 302
 303   -- TODO count it too
 304     cands' = setListType (Just CandidateTerm)
 305           {-\$  groupedMonoTail
 306           <>-} groupedMultTail
 307
 308     -- Quick FIX
 309     candNgramsElement = List.take 5000
 310                       $ toNgramsElement cands <> toNgramsElement cands'
 311
 312     result = Map.unionsWith (<>)
 313        [ Map.fromList [( nt, toNgramsElement maps
 314                           <> toNgramsElement stopTerms
 315                           <> candNgramsElement
 316                       )]
 317        ]
 318
 319   pure result