{-| Module : Gargantext.Text.Ngrams.Lists Description : Tools to build lists Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX -} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} module Gargantext.Text.List where import Data.Either (partitionEithers, Either(..)) import Debug.Trace (trace) import Data.Map (Map) import Data.Set (Set) import Data.Text (Text) import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList) import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..)) import Gargantext.Core (Lang(..)) import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId) import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith) import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Utils (Cmd) import Gargantext.Text.List.Learn (Model(..)) import Gargantext.Text.Metrics (takeScored) import Gargantext.Prelude --import Gargantext.Text.Terms (TermType(..)) import qualified Data.Char as Char import qualified Data.List as List import qualified Data.Map as Map import qualified Data.Set as Set import qualified Data.Text as Text data NgramsListBuilder = BuilderStepO { stemSize :: Int , stemX :: Int , stopSize :: Int } | BuilderStep1 { withModel :: Model } | BuilderStepN { withModel :: Model } | Tficf { nlb_lang :: Lang , nlb_group1 :: Int , nlb_group2 :: Int , nlb_stopSize :: StopSize , nlb_userCorpusId :: UserCorpusId , nlb_masterCorpusId :: MasterCorpusId } data StopSize = StopSize {unStopSize :: Int} -- | TODO improve grouping functions of Authors, Sources, Institutes.. buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId -> Cmd err (Map NgramsType [NgramsElement]) buildNgramsLists l n m s uCid mCid = do ngTerms <- buildNgramsTermsList l n m s uCid mCid --ngTerms <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 500 50 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes] pure $ Map.unions $ othersTerms <> [ngTerms] buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType -> Cmd err (Map NgramsType [NgramsElement]) buildNgramsOthersList uCid groupIt nt = do ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt let all' = Map.toList ngs pure $ (toElements GraphTerm all') <> (toElements CandidateTerm all') --pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all') where toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList []) | (t,_ns) <- x ] ) ] --{- buildNgramsTermsList' :: UserCorpusId -> (Text -> Text) -> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int -> Cmd err (Map NgramsType [NgramsElement]) --} buildNgramsTermsList' uCid groupIt stop gls is = do ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms let (stops, candidates) = partitionEithers $ map (\t -> if stop t then Left t else Right t) $ Map.toList $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs (maps, candidates') = takeScored gls is $ getCoocByNgrams' snd (Diagonal True) $ Map.fromList candidates toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t)) (s,c,m) = (stops , List.filter (\(k,_) -> List.elem k candidates') candidates , List.filter (\(k,_) -> List.elem k maps) candidates ) let ngs' = List.concat $ map toNgramsElement $ map (\t -> (StopTerm, toList' t)) s <> map (\t -> (CandidateTerm, toList' t)) c <> map (\t -> (GraphTerm, toList' t)) m pure $ Map.fromList [(NgramsTerms, ngs')] buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId -> Cmd err (Map NgramsType [NgramsElement]) buildNgramsTermsList l n m s uCid mCid = do candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m) let candidatesSize = 2000 a = 10 b = 10 candidatesHead = List.take candidatesSize candidates candidatesTail = List.drop candidatesSize candidates termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead) <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail) let ngs = List.concat $ map toNgramsElement termList pure $ Map.fromList [(NgramsTerms, ngs)] toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement] toNgramsElement (listType, (_stem, (_score, setNgrams))) = case Set.toList setNgrams of [] -> [] (parent:children) -> [parentElem] <> childrenElems where parentElem = mkNgramsElement parent listType Nothing (mSetFromList children) childrenElems = map (\t -> mkNgramsElement t listType (Just $ RootParent parent parent) (mSetFromList []) ) children toList :: (b -> Bool) -> ListType -> b -> (ListType, b) toList stop l n = case stop n of True -> (StopTerm, n) False -> (l, n) toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)] toTermList _ _ _ [] = [] toTermList a b stop ns = trace ("computing toTermList") $ map (toList stop CandidateTerm) xs <> map (toList stop GraphTerm) ys <> toTermList a b stop zs where xs = take a ns ta = drop a ns ys = take b ta zs = drop b ta isStopTerm :: StopSize -> Text -> Bool isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x) where isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)