{-| Module : Gargantext.Core.Text.Ngrams.Lists Description : Tools to build lists Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX -} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TemplateHaskell #-} module Gargantext.Core.Text.List where import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2)) import Data.Map (Map) import Data.Maybe (catMaybes) import Data.Monoid (mempty) import Data.Ord (Down(..)) import Data.Set (Set) import Data.Text (Text) import Data.Tuple.Extra (both) import Gargantext.API.Ngrams.Types (NgramsElement) import Gargantext.API.Ngrams.Types (RepoCmdM) import Gargantext.Core.Text (size) import Gargantext.Core.Text.List.Group import Gargantext.Core.Text.List.Group.Prelude import Gargantext.Core.Text.List.Group.WithStem import Gargantext.Core.Text.List.Social import Gargantext.Core.Text.List.Social.Prelude import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal) import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId) import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser) import Gargantext.Database.Action.Metrics.TFICF (getTficf) import Gargantext.Database.Admin.Types.Node (NodeId) import Gargantext.Database.Prelude (CmdM) import Gargantext.Database.Query.Table.Node (defaultList) import Gargantext.Database.Query.Table.Node.Error (HasNodeError()) import Gargantext.Database.Query.Tree.Error (HasTreeError) import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Prelude import qualified Data.Char as Char import qualified Data.List as List import qualified Data.Map as Map import qualified Data.Set as Set import qualified Data.Text as Text -- | TODO improve grouping functions of Authors, Sources, Institutes.. buildNgramsLists :: ( RepoCmdM env err m , CmdM env err m , HasTreeError err , HasNodeError err ) => User -> GroupParams -> UserCorpusId -> MasterCorpusId -> m (Map NgramsType [NgramsElement]) buildNgramsLists user gp uCid mCid = do ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350) othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity) [ (Authors , MapListSize 9) , (Sources , MapListSize 9) , (Institutes, MapListSize 9) ] pure $ Map.unions $ [ngTerms] <> othersTerms data MapListSize = MapListSize { unMapListSize :: !Int } buildNgramsOthersList ::( HasNodeError err , CmdM env err m , RepoCmdM env err m , HasTreeError err ) => User -> UserCorpusId -> GroupParams -> (NgramsType, MapListSize) -> m (Map NgramsType [NgramsElement]) buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt -- | PrivateFirst for first developments since Public NodeMode is not implemented yet socialLists' :: FlowCont Text FlowListScores <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty $ Map.fromList $ List.zip (Map.keys allTerms) (List.cycle [mempty]) ) {- printDebug "flowSocialList'" $ Map.filter (not . ((==) Map.empty) . (view fls_parents)) $ view flc_scores socialLists' -} let groupedWithList = toGroupedTree groupParams socialLists' allTerms {- printDebug "groupedWithList" $ view flc_cont groupedWithList -} let (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms listSize = mapListSize - (List.length mapTerms) (mapTerms', candiTerms) = both Map.fromList $ List.splitAt listSize $ List.sortOn (Down . viewScore . snd) $ Map.toList tailTerms' pure $ Map.fromList [( nt, (toNgramsElement stopTerms) <> (toNgramsElement mapTerms ) <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' ) <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms) )] -- TODO use ListIds buildNgramsTermsList :: ( HasNodeError err , CmdM env err m , RepoCmdM env err m , HasTreeError err ) => User -> UserCorpusId -> MasterCorpusId -> GroupParams -> (NgramsType, MapListSize) -> m (Map NgramsType [NgramsElement]) buildNgramsTermsList user uCid mCid groupParams (nt, mapListSize)= do -- Computing global speGen score allTerms :: Map Text Double <- getTficf uCid mCid nt -- | PrivateFirst for first developments since Public NodeMode is not implemented yet socialLists' :: FlowCont Text FlowListScores <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty $ Map.fromList $ List.zip (Map.keys allTerms) (List.cycle [mempty]) ) {- printDebug "flowSocialList'" $ Map.filter (not . ((==) Map.empty) . (view fls_parents)) $ view flc_scores socialLists' -} let groupedWithList = toGroupedTree groupParams socialLists' allTerms {- -- TODO remove -- 8<-- 8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<-- -- First remove stops terms socialLists <- flowSocialList user nt (Set.fromList $ map fst $ Map.toList allTerms) -- Grouping the ngrams and keeping the maximum score for label let grouped = groupedTextWithStem ( GroupedTextParams (groupWith groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms groupedWithList = map (addListType (invertForw socialLists)) grouped -- 8<-- 8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<-- -} (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList -- (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms -- printDebug "\n * stopTerms * \n" stopTerms -- splitting monterms and multiterms to take proportional candidates let listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small monoSize = 0.4 :: Double multSize = 1 - monoSize splitAt n' ns = both (Map.fromListWith (<>)) $ List.splitAt (round $ n' * listSizeGlobal) $ List.sortOn (viewScore . snd) $ Map.toList ns (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult -- printDebug "groupedMonoHead" (List.length groupedMonoHead) -- printDebug "groupedMonoTail" (List.length groupedMonoHead) -- printDebug "groupedMultHead" (List.length groupedMultHead) -- printDebug "groupedMultTail" (List.length groupedMultTail) {- let -- Get Local Scores now for selected grouped ngrams -- TODO HasTerms selectedTerms = Set.toList $ List.foldl' (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set' $ Set.insert l' g ) Set.empty (groupedMonoHead <> groupedMultHead) -} selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead) -- TO remove (and remove HasNodeError instance) userListId <- defaultList uCid masterListId <- defaultList mCid mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] nt selectedTerms -- TODO let groupedTreeScores_SetNodeId = setScoresWith mapTextDocIds (groupedMonoHead <> groupedMultHead) {- -- TODO remove -- 8<-- 8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<-- let mapGroups = Map.fromList $ map (\g -> (g ^. gt_stem, g)) $ groupedMonoHead <> groupedMultHead -- grouping with Set NodeId contextsAdded = foldl' (\mapGroups' k -> let k' = groupWith groupParams k in case Map.lookup k' mapGroups' of Nothing -> mapGroups' Just g -> case Map.lookup k mapTextDocIds of Nothing -> mapGroups' Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups' ) mapGroups $ Map.keys mapTextDocIds -} -- | Coocurrences computation --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric let mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2) | (t1, s1) <- mapStemNodeIds , (t2, s2) <- mapStemNodeIds ] where mapStemNodeIds = Map.toList $ Map.map viewScores $ groupedTreeScores_SetNodeId let -- computing scores mapScores f = Map.fromList $ map (\s@(Scored t g s') -> (t, f s)) $ normalizeGlobal $ map normalizeLocal $ scored' mapCooc let -- groupedTreeScores_SpeGen :: GroupedTreeScores (Scored Double) groupedTreeScores_SpeGen = setScoresWith (mapScores identity) (groupedMonoHead <> groupedMultHead) {- -- TODO remove -- 8<-- 8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<-- groupsWithScores = catMaybes $ map (\(stem, g) -> case Map.lookup stem mapScores' of Nothing -> Nothing Just s' -> set gts'_score s' g ) $ Map.toList $ view flc_scores contextsAdded where mapScores' = mapScores identity -- 8<-- 8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<-- -- TODO remove -} -- adapt2 TOCHECK with DC -- printDebug "groupsWithScores" groupsWithScores -- 8<-- 8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<--8<-- -- TODO remove --} let -- sort / partition / split -- filter mono/multi again (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen -- (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores -- filter with max score -- partitionWithMaxScore = List.partition (\g -> let (s1,s2) = viewScore g in s1 > s2 ) partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g) > (view scored_speExc $ view gts'_score g) ) (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored -- splitAt let listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small inclSize = 0.4 :: Double exclSize = 1 - inclSize --splitAt' n' = (both (Map.fromListWith (<>))) . (List.splitAt (round $ n' * listSizeLocal)) splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal)) sortOn f = (List.sortOn (Down . f . _gts'_score . snd)) . Map.toList --sortOn f = (List.sortOn (Down . (gts'_score))) . Map.toList -- sort = (List.sortOn (Down . viewScore)) (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ (sortOn _scored_genInc) monoScoredIncl (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ (sortOn _scored_speExc) monoScoredExcl (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ (sortOn _scored_genInc) multScoredIncl (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ (sortOn _scored_speExc) multScoredExcl -- Final Step building the Typed list termListHead = maps <> cands where maps = setListType (Just MapTerm) $ monoScoredInclHead <> monoScoredExclHead <> multScoredInclHead <> multScoredExclHead cands = setListType (Just CandidateTerm) $ monoScoredInclTail <> monoScoredExclTail <> multScoredInclTail <> multScoredExclTail termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail) -- printDebug "monoScoredInclHead" monoScoredInclHead -- printDebug "monoScoredExclHead" monoScoredExclTail -- printDebug "multScoredInclHead" multScoredInclHead -- printDebug "multScoredExclTail" multScoredExclTail let result = Map.unionsWith (<>) [ Map.fromList [( nt, toNgramsElement termListHead <> toNgramsElement termListTail <> toNgramsElement stopTerms )] ] -- printDebug "\n result \n" r pure result toGargList :: Maybe ListType -> b -> (Maybe ListType, b) toGargList l n = (l,n) isStopTerm :: StopSize -> Text -> Bool isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x) where isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c) ------------------------------------------------------------------------------