{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}

module Gargantext.Core.Text.List
  where

import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.HashSet as HashSet

{-
-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
-}
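
{-
-- Hedged worked example for the heuristic kept above (illustration only):
-- with a StopSize of 3,
--
--   isStopTerm (StopSize 3) "of"           == True   -- shorter than 3 characters
--   isStopTerm (StopSize 3) "50%"          == True   -- '5' is neither alphabetic nor in "- /()%"
--   isStopTerm (StopSize 3) "gene-therapy" == False  -- long enough, and only letters and '-'
-}
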
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( RepoCmdM env err m
                    , CmdM     env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => GroupParams
                 -> User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists gp user uCid mCid = do
  ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
                      [ (Authors   , MapListSize 9)
                      , (Sources   , MapListSize 9)
                      , (Institutes, MapListSize 9)
                      ]
  pure $ Map.unions $ [ngTerms] <> othersTerms


data MapListSize = MapListSize { unMapListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM     env err m
                         , RepoCmdM env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> GroupParams
                      -> (NgramsType, MapListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt

  -- PrivateFirst for first developments, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
                                          $ HashMap.fromList
                                          $ List.zip (HashMap.keys allTerms)
                                                     (List.cycle [mempty])
                                          )
  {-
  if nt == Sources -- Authors
     then printDebug "flowSocialList" socialLists
     else printDebug "flowSocialList" ""
  -}

  let groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
  {-
  if nt == Sources -- Authors
     then printDebug "groupedWithList" groupedWithList
     else printDebug "groupedWithList" ""
  -}

  let (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                             $ view flc_scores groupedWithList

      (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

      listSize = mapListSize - (List.length mapTerms)

      (mapTerms', candiTerms) = both HashMap.fromList
                              $ List.splitAt listSize
                              $ List.sortOn (Down . viewScore . snd)
                              $ HashMap.toList tailTerms'

  pure $ Map.fromList [( nt,  (toNgramsElement stopTerms)
                           <> (toNgramsElement mapTerms )
                           <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
                           <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                      )]


getGroupParams :: ( HasNodeError err
                  , CmdM     env err m
                  , RepoCmdM env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  printDebug "hashMap" hashMap
  pure $ over gwl_map (\x -> x <> hashMap) gp
getGroupParams gp _ = pure gp
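
{-
-- Hedged illustration of the selection pattern used in buildNgramsOthersList
-- above: candidates are sorted by descending score, then cut at the remaining
-- list size; the same sort-then-split shape appears again in
-- buildNgramsTermsList below. 'selectTop' is a hypothetical helper kept as a
-- comment, not part of the module.
--
-- selectTop :: Ord score => Int -> [(term, score)] -> ([(term, score)], [(term, score)])
-- selectTop n = List.splitAt n . List.sortOn (Down . snd)
--
-- e.g. selectTop 2 [("a", 1), ("b", 3), ("c", 2)] == ([("b", 3), ("c", 2)], [("a", 1)])
-}
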
-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM     env err m
                        , RepoCmdM env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize) = do

  -- Filter 0 With Double
  -- Computing global speGen score
  allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt

  -- PrivateFirst for first developments, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
                                          $ HashMap.fromList
                                          $ List.zip (HashMap.keys allTerms)
                                                     (List.cycle [mempty])
                                          )

  let ngramsKeys = HashMap.keysSet allTerms

  groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)

  let socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
  --printDebug "socialLists_Stemmed" socialLists_Stemmed

  let groupedWithList = toGroupedTree socialLists_Stemmed allTerms

      (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                  $ view flc_scores groupedWithList

      (groupedMono, groupedMult)  = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
  -- printDebug "stopTerms" stopTerms

  -- splitting monoterms and multiterms to take proportional candidates
  let
    -- use a % of the list if it is too big, or an Int if it is too small
    listSizeGlobal = 2000 :: Double
    monoSize = 0.4 :: Double
    multSize = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns

    (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
    (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult

    -------------------------
    -- Filter 1 With Set NodeId and SpeGen
    selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)

  -- TODO remove (and remove the HasNodeError instance)
  userListId   <- defaultList uCid
  masterListId <- defaultList mCid

  mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] nt selectedTerms

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)

  -- Cooccurrences computation
  --, t1 >= t2 -- permute byAxis diag -- since the matrix is symmetric
  let mapCooc = HashMap.filter (>2)
              $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                                 | (t1, s1) <- mapStemNodeIds
                                 , (t2, s2) <- mapStemNodeIds
                                 ]
        where
          mapStemNodeIds = HashMap.toList
                         $ HashMap.map viewScores
                         $ groupedTreeScores_SetNodeId

  let
    -- computing scores
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc

  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) (groupedMonoHead <> groupedMultHead)

  let
    -- sort / partition / split
    -- filter mono/multi again
    (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

    -- filter with max score
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored

  -- splitAt
  let
    -- use a % of the list if it is too big, or an Int if it is too small
    listSizeLocal = 1000 :: Double
    inclSize = 0.4 :: Double
    exclSize = 1 - inclSize

    splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
    sortOn  f   = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
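
    -- Hedged worked example of the split sizes used just below (illustration
    -- only): with listSizeLocal = 1000, monoSize = 0.4 and inclSize = 0.4
    -- (hence multSize = exclSize = 0.6), each mono head keeps
    -- round (0.4 * 0.4 / 2 * 1000) = 80 terms and each mult head keeps
    -- round (0.6 * 0.6 / 2 * 1000) = 180 terms, i.e. at most
    -- 2*80 + 2*180 = 520 map terms come out of this step.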
    monoInc_size = splitAt' $ monoSize * inclSize / 2
    (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
    (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl

    multExc_size = splitAt' $ multSize * exclSize / 2
    (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
    (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl

    ------------------------------------------------------------
    -- Final Step building the Typed list
    termListHead = maps <> cands
      where
        maps = setListType (Just MapTerm)
             $  monoScoredInclHead
             <> monoScoredExclHead
             <> multScoredInclHead
             <> multScoredExclHead

        cands = setListType (Just CandidateTerm)
              $  monoScoredInclTail
              <> monoScoredExclTail
              <> multScoredInclTail
              <> multScoredExclTail

    termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)

  let result = Map.unionsWith (<>)
             [ Map.fromList [( nt, toNgramsElement termListHead
                                <> toNgramsElement termListTail
                                <> toNgramsElement stopTerms
                             )]
             ]

  -- printDebug "result" result
  pure result
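
{-
-- Hedged usage sketch (kept as a comment, not part of the module): a caller
-- satisfying the RepoCmdM / CmdM / HasTreeError / HasNodeError constraints
-- could build every list for a corpus in one call. The User value and the
-- corpus ids below are placeholders, not values defined here.
--
--   lists <- buildNgramsLists GroupIdentity (UserName "alice") userCorpusId masterCorpusId
--
-- 'lists' then maps each NgramsType to its NgramsElements, with NgramsTerms
-- capped at MapListSize 350 and Authors / Sources / Institutes at MapListSize 9.
-}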