{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
- ( FlowCmdM
+ ( DataText(..)
, getDataText
, flowDataText
, flow
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map
+import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
-import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists)
+import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
+import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP))
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
+import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
+import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
-import Gargantext.Database.Query.Table.NodeNodeNgrams2
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
------------------------------------------------------------------------
--- Impots for upgrade function
+-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Root (getRootId)
import Gargantext.Database.Query.Tree (findNodesId)
import qualified Data.List as List
-> DataText
-> TermType Lang
-> CorpusId
+ -> Maybe FlowSocialListWith
+ -> (JobLog -> m ())
-> m CorpusId
-flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
+flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
where
corpusType = (Nothing :: Maybe HyperdataCorpus)
-flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
+flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus
------------------------------------------------------------------------
-- TODO use proxy
-> Either CorpusName [CorpusId]
-> (TermType Lang)
-> FilePath
+ -> (JobLog -> m ())
-> m AnnuaireId
-flowAnnuaire u n l filePath = do
+flowAnnuaire u n l filePath logStatus = do
docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
- flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
+ flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus
------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
-> Either CorpusName [CorpusId]
-> Limit -- Limit the number of docs (for dev purpose)
-> TermType Lang -> FileFormat -> FilePath
+ -> Maybe FlowSocialListWith
+ -> (JobLog -> m ())
-> m CorpusId
-flowCorpusFile u n l la ff fp = do
+flowCorpusFile u n l la ff fp mfslw logStatus = do
eParsed <- liftBase $ parseFile ff fp
case eParsed of
Right parsed -> do
let docs = splitEvery 500 $ take l parsed
- flowCorpus u n la (map (map toHyperdataDocument) docs)
+ flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
Left e -> panic $ "Error: " <> (T.pack e)
------------------------------------------------------------------------
=> User
-> Either CorpusName [CorpusId]
-> TermType Lang
+ -> Maybe FlowSocialListWith
-> [[a]]
+ -> (JobLog -> m ())
-> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-> User
-> Either CorpusName [CorpusId]
-> TermType Lang
+ -> Maybe FlowSocialListWith
-> [[a]]
+ -> (JobLog -> m ())
-> m CorpusId
-flow c u cn la docs = do
+flow c u cn la mfslw docs logStatus = do
-- TODO if public insertMasterDocs else insertUserDocs
- ids <- traverse (insertMasterDocs c la) docs
- flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
+ ids <- traverse (\(idx, doc) -> do
+ id <- insertMasterDocs c la doc
+ logStatus JobLog { _scst_succeeded = Just $ 1 + idx
+ , _scst_failed = Just 0
+ , _scst_remaining = Just $ length docs - idx
+ , _scst_events = Just []
+ }
+ pure id
+ ) (zip [1..] docs)
+ flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
+
+
+
------------------------------------------------------------------------
flowCorpusUser :: ( FlowCmdM env err m
-> Either CorpusName [CorpusId]
-> Maybe c
-> [NodeId]
+ -> Maybe FlowSocialListWith
-> m CorpusId
-flowCorpusUser l user corpusName ctype ids = do
+flowCorpusUser l user corpusName ctype ids mfslw = do
-- User Flow
(userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
-- NodeTexts is first
--let gp = (GroupParams l 2 3 (StopSize 3))
let gp = GroupWithPosTag l CoreNLP HashMap.empty
- ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
+ ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
+
+ -- printDebug "flowCorpusUser:ngs" ngs
_userListId <- flowList_DbRepo listId ngs
_mastListId <- getOrMkList masterCorpusId masterUserId
--_ <- mkPhylo userCorpusId userId
-- Annuaire Flow
-- _ <- mkAnnuaire rootUserId userId
+ _ <- updateNgramsOccurrences userCorpusId (Just listId)
+
pure userCorpusId
-> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
- let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
+ printDebug "terms2id" terms2id
- -- to be removed
- let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
+ let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- new
mapCgramsId <- listInsertDb lId toNodeNgramsW'
$ map (first _ngramsTerms . second Map.keys)
$ HashMap.toList mapNgramsDocs
+ printDebug "saveDocNgramsWith" mapCgramsId
-- insertDocNgrams
- _return <- insertNodeNodeNgrams2
- $ catMaybes [ NodeNodeNgrams2 <$> Just nId
- <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
- <*> Just (fromIntegral w :: Double)
+ _return <- insertContextNodeNgrams2
+ $ catMaybes [ ContextNodeNgrams2 <$> Just nId
+ <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
+ <*> Just (fromIntegral w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, w) <- Map.toList mapNodeIdWeight
]
+
-- to be removed
- _ <- insertDocNgrams lId indexedNgrams
+ _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
pure ()
=> UserId
-> CorpusId
-> [a]
- -> m ([DocId], [Indexed NodeId a])
+ -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
let docs = map addUniqId hs
newIds <- insertDb uId cId docs
-- | TODO putelsewhere
-- | Upgrade function
-- Suppose all documents are English (this is the case actually)
-indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
+indexAllDocumentsWithPosTag :: FlowCmdM env err m
+ => m ()
indexAllDocumentsWithPosTag = do
rootId <- getRootId (UserName userMaster)
corpusIds <- findNodesId rootId [NodeCorpus]
docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
-
_ <- mapM extractInsert (splitEvery 1000 docs)
-
pure ()
-extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
+extractInsert :: FlowCmdM env err m
+ => [Node HyperdataDocument] -> m ()
extractInsert docs = do
let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
-
mapNgramsDocs' <- mapNodeIdNgrams
<$> documentIdWithNgrams
(extractNgramsT $ withLang (Multi EN) documentsWithId)
documentsWithId
-
_ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
-
pure ()