src/Gargantext/Core/Text/List.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams.Lists
   3 Description : Tools to build lists
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 {-# LANGUAGE ScopedTypeVariables #-}
  13 {-# LANGUAGE TemplateHaskell     #-}
  14
  15 module Gargantext.Core.Text.List
  16   where
  17
  18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
  19 import Data.HashMap.Strict (HashMap)
  20 import Data.HashSet (HashSet)
  21 import Data.Map (Map)
  22 import Data.Monoid (mempty)
  23 import Data.Ord (Down(..))
  24 import Data.Set (Set)
  25 import Data.Tuple.Extra (both)
  26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
  27 import Gargantext.Core.NodeStory
  28 import Gargantext.Core.Text (size)
  29 import Gargantext.Core.Text.List.Group
  30 import Gargantext.Core.Text.List.Group.Prelude
  31 import Gargantext.Core.Text.List.Group.WithStem
  32 import Gargantext.Core.Text.List.Social
  33 import Gargantext.Core.Text.List.Social.Prelude
  34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
  35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
  36 import Gargantext.Core.Types.Individu (User(..))
  37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
  38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
  39 import Gargantext.Database.Admin.Types.Node (NodeId)
  40 import Gargantext.Database.Prelude (CmdM)
  41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
  42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
  43 import Gargantext.Database.Query.Table.Node (defaultList)
  44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
  45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
  46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
  47 import Gargantext.Prelude
  48 import qualified Data.HashMap.Strict as HashMap
  49 import qualified Data.HashSet as HashSet
  50 import qualified Data.List    as List
  51 import qualified Data.Map     as Map
  52 import qualified Data.Set     as Set
  53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
  54
  55 {-
  56 -- TODO maybe useful for later
  57 isStopTerm :: StopSize -> Text -> Bool
  58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  59   where
  60     isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
  61 -}
  62
  63
  64 -- | TODO improve grouping functions of Authors, Sources, Institutes..
  65 buildNgramsLists :: ( HasNodeStory env err m
  66                     , CmdM     env err m
  67                     , HasTreeError err
  68                     , HasNodeError err
  69                     )
  70                  => GroupParams
  71                  -> User
  72                  -> UserCorpusId
  73                  -> MasterCorpusId
  74                  -> m (Map NgramsType [NgramsElement])
  75 buildNgramsLists gp user uCid mCid = do
  76   ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  77   othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
  78                       [ (Authors   , MapListSize 9)
  79                       , (Sources   , MapListSize 9)
  80                       , (Institutes, MapListSize 9)
  81                       ]
  82
  83   pure $ Map.unions $ [ngTerms] <> othersTerms
  84
  85
  86 data MapListSize = MapListSize { unMapListSize :: !Int }
  87
  88 buildNgramsOthersList ::( HasNodeError err
  89                         , CmdM     env err m
  90                         , HasNodeStory env err m
  91                         , HasTreeError err
  92                         )
  93                         => User
  94                         -> UserCorpusId
  95                         -> GroupParams
  96                         -> (NgramsType, MapListSize)
  97                         -> m (Map NgramsType [NgramsElement])
  98 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  99   allTerms  :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
 100
 101   -- PrivateFirst for first developments since Public NodeMode is not implemented yet
 102   socialLists :: FlowCont NgramsTerm FlowListScores
 103     <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
 104                                                       $ HashMap.fromList
 105                                                       $ List.zip (HashMap.keys allTerms)
 106                                                                  (List.cycle [mempty])
 107                                            )
 108   let
 109     groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
 110
 111     (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
 112                            $ view flc_scores groupedWithList
 113
 114     (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm)  . viewListType) tailTerms
 115
 116     listSize = mapListSize - (List.length mapTerms)
 117     (mapTerms', candiTerms) = both HashMap.fromList
 118                             $ List.splitAt listSize
 119                             $ List.sortOn (Down . viewScore . snd)
 120                             $ HashMap.toList tailTerms'
 121
 122
 123   pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
 124                           <> (toNgramsElement mapTerms )
 125                           <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
 126                           <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
 127                           )]
 128
 129
 130 getGroupParams :: ( HasNodeError err
 131                   , CmdM     env err m
 132                   , HasNodeStory env err m
 133                   , HasTreeError err
 134                   )
 135                => GroupParams -> HashSet Ngrams -> m GroupParams
 136 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
 137   hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
 138   -- printDebug "hashMap" hashMap
 139   pure $ over gwl_map (\x -> x <> hashMap) gp
 140 getGroupParams gp _ = pure gp
 141
 142
 143 -- TODO use ListIds
 144 buildNgramsTermsList :: ( HasNodeError err
 145                         , CmdM     env err m
 146                         , HasNodeStory env err m
 147                         , HasTreeError err
 148                         )
 149                         => User
 150                         -> UserCorpusId
 151                         -> MasterCorpusId
 152                         -> GroupParams
 153                         -> (NgramsType, MapListSize)
 154                         -> m (Map NgramsType [NgramsElement])
 155 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
 156
 157 -- Filter 0 With Double
 158 -- Computing global speGen score
 159   allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
 160
 161   -- PrivateFirst for first developments since Public NodeMode is not implemented yet
 162   socialLists :: FlowCont NgramsTerm FlowListScores
 163     <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
 164                                                       $ HashMap.fromList
 165                                                       $ List.zip (HashMap.keys   allTerms)
 166                                                                  (List.cycle     [mempty])
 167                                            )
 168   let ngramsKeys = HashMap.keysSet allTerms
 169
 170   groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
 171
 172   let
 173     socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
 174   --printDebug "socialLists_Stemmed" socialLists_Stemmed
 175     groupedWithList = toGroupedTree socialLists_Stemmed allTerms
 176     (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
 177                                 $ HashMap.filter (\g -> (view gts'_score g) > 1)
 178                                 $ view flc_scores groupedWithList
 179
 180     (groupedMono, groupedMult)  = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
 181
 182   -- printDebug "stopTerms" stopTerms
 183
 184   -- splitting monterms and multiterms to take proportional candidates
 185     -- use % of list if to big, or Int if too small
 186     listSizeGlobal = 2000 :: Double
 187     monoSize = 0.4  :: Double
 188     multSize = 1 - monoSize
 189
 190     splitAt n' ns = both (HashMap.fromListWith (<>))
 191                   $ List.splitAt (round $ n' * listSizeGlobal)
 192                   $ List.sortOn (viewScore . snd)
 193                   $ HashMap.toList ns
 194
 195     (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
 196     (groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
 197
 198 -------------------------
 199 -- Filter 1 With Set NodeId and SpeGen
 200     selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
 201
 202
 203  -- TODO remove (and remove HasNodeError instance)
 204   userListId    <- defaultList uCid
 205   masterListId  <- defaultList mCid
 206
 207   mapTextDocIds <- getNodesByNgramsOnlyUser uCid
 208                                             [userListId, masterListId]
 209                                             nt
 210                                             selectedTerms
 211
 212   let
 213     groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
 214     groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
 215                                 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
 216
 217
 218   printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
 219
 220   -- Coocurrences computation
 221   --, t1 >= t2 -- permute byAxis diag  -- since matrix symmetric
 222   let mapCooc = HashMap.filter (>1) -- removing cooc of 1
 223               $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
 224                            | (t1, s1) <- mapStemNodeIds
 225                            , (t2, s2) <- mapStemNodeIds
 226                            ]
 227           where
 228             mapStemNodeIds = HashMap.toList
 229                            $ HashMap.map viewScores
 230                            $ groupedTreeScores_SetNodeId
 231   let
 232     -- computing scores
 233     mapScores f = HashMap.fromList
 234                 $ map (\g -> (view scored_terms g, f g))
 235                 $ normalizeGlobal
 236                 $ map normalizeLocal
 237                 $ scored'
 238                 $ Map.fromList -- TODO remove this
 239                 $ HashMap.toList mapCooc
 240
 241   let
 242     groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
 243     groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
 244
 245   let
 246     -- sort / partition / split
 247     -- filter mono/multi again
 248     (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
 249
 250       -- filter with max score
 251     partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
 252                                                    > (view scored_speExc $ view gts'_score g)
 253                                               )
 254
 255     (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
 256     (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
 257
 258   -- splitAt
 259   let
 260     -- use % of list if to big, or Int if to small
 261     mapSize = 1000 :: Double
 262     canSize = mapSize * 5 :: Double
 263
 264     inclSize = 0.4  :: Double
 265     exclSize = 1 - inclSize
 266
 267     splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
 268     sortOn   f  = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
 269
 270     monoInc_size n = splitAt' n $ monoSize * inclSize / 2
 271     multExc_size n = splitAt' n $ multSize * exclSize / 2
 272
 273
 274     (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
 275     (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
 276
 277     (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
 278     (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
 279
 280
 281     (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
 282     (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
 283
 284     (canMulScoredInclHead, _)  = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
 285     (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
 286
 287 ------------------------------------------------------------
 288     -- Final Step building the Typed list
 289     -- Candidates Terms need to be filtered
 290   let
 291     maps = setListType (Just MapTerm)
 292         $  mapMonoScoredInclHead
 293         <> mapMonoScoredExclHead
 294         <> mapMultScoredInclHead
 295         <> mapMultScoredExclHead
 296
 297     -- An original way to filter to start with
 298     cands = setListType (Just CandidateTerm)
 299           $ canMonoScoredIncHead
 300           <> canMonoScoredExclHead
 301           <> canMulScoredInclHead
 302           <> canMultScoredExclHead
 303
 304   -- TODO count it too
 305     cands' = setListType (Just CandidateTerm)
 306           {-\$  groupedMonoTail
 307           <>-} groupedMultTail
 308
 309     -- Quick FIX
 310     candNgramsElement = List.take 5000
 311                       $ toNgramsElement cands <> toNgramsElement cands'
 312
 313     result = Map.unionsWith (<>)
 314        [ Map.fromList [( nt, toNgramsElement maps
 315                           <> toNgramsElement stopTerms
 316                           <> candNgramsElement
 317                       )]
 318        ]
 319
 320   pure result