module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
( DataText(..)
, getDataText
+ , getDataText_Debug
, flowDataText
, flow
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map
+import qualified Data.Conduit.List as CL
+import qualified Data.Conduit as C
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
-import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
+import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
-import qualified Prelude as Prelude
+import qualified Prelude
------------------------------------------------------------------------
-- Imports for upgrade function
| DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
-- | DataNew ![[HyperdataDocument]]
+-- Show instance is not possible because of IO
+printDataText :: DataText -> IO ()
+printDataText (DataOld xs) = putStrLn $ show xs
+printDataText (DataNew (maybeInt, conduitData)) = do
+ res <- C.runConduit (conduitData .| CL.consume)
+ putStrLn $ show (maybeInt, res)
+
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
=> DataOrigin
ids <- map fst <$> searchDocInDatabase cId (stemIt q)
pure $ Right $ DataOld ids
+getDataText_Debug :: FlowCmdM env err m
+ => DataOrigin
+ -> TermType Lang
+ -> API.Query
+ -> Maybe API.Limit
+ -> m ()
+getDataText_Debug a l q li = do
+ result <- getDataText a l q li
+ case result of
+ Left err -> liftBase $ putStrLn $ show err
+ Right res -> liftBase $ printDataText res
+
+
-------------------------------------------------------------------------------
flowDataText :: forall env err m.
( FlowCmdM env err m
=> User
-> Either CorpusName [CorpusId]
-> Limit -- Limit the number of docs (for dev purpose)
- -> TermType Lang -> FileFormat -> FilePath
+ -> TermType Lang
+ -> FileType
+ -> FileFormat
+ -> FilePath
-> Maybe FlowSocialListWith
-> (JobLog -> m ())
-> m CorpusId
-flowCorpusFile u n _l la ff fp mfslw logStatus = do
- eParsed <- liftBase $ parseFile ff fp
+flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
+ eParsed <- liftBase $ parseFile ft ff fp
case eParsed of
Right parsed -> do
flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
--let docs = splitEvery 500 $ take l parsed
--flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
- Left e -> panic $ "Error: " <> (T.pack e)
+ Left e -> panic $ "Error: " <> T.pack e
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-> m CorpusId
flow c u cn la mfslw (mLength, docsC) logStatus = do
-- TODO if public insertMasterDocs else insertUserDocs
- ids <- runConduit $
- zipSources (yieldMany [1..]) docsC
- .| mapMC insertDoc
- .| sinkList
+ ids <- runConduit $ zipSources (yieldMany [1..]) docsC
+ .| mapMC insertDoc
+ .| sinkList
-- ids <- traverse (\(idx, doc) -> do
-- id <- insertMasterDocs c la doc
-- logStatus JobLog { _scst_succeeded = Just $ 1 + idx
(masterUserId, _masterRootId, masterCorpusId)
<- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
- --let gp = (GroupParams l 2 3 (StopSize 3))
+ --let gp = (GroupParams l 2 3 (StopSize 3))
+ -- Here the PosTagAlgo should be chosen according the Lang
let gp = GroupWithPosTag l CoreNLP HashMap.empty
ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
-> m [DocId]
insertMasterDocs c lang hs = do
(masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
- (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
+ (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
_ <- Doc.add masterCorpusId ids'
-- TODO
-- create a corpus with database name (CSV or PubMed)
-> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
let docs = map addUniqId hs
- newIds <- insertDb uId cId docs
+ newIds <- insertDb uId Nothing docs
-- printDebug "newIds" newIds
let
newIds' = map reId newIds