-- TODO-EVENTS: InsertedNodes
-}
-{-# OPTIONS_GHC -fno-warn-orphans #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE ConstraintKinds #-}
-{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds #-}
-{-# LANGUAGE DeriveGeneric #-}
-{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE InstanceSigs #-}
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
( FlowCmdM
, getDataText
, flowDataText
+ , flow
, flowCorpusFile
, flowCorpus
, flowAnnuaire
+ , insertMasterDocs
, getOrMkRoot
, getOrMk_RootWithCorpus
, DataOrigin(..)
, allDataOrigins
--- To remove maybe
- , tt_lang
- , tt_ngramsSize
- , tt_windowSize
, do_api
)
where
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.List (concat)
+import qualified Data.Map as Map
import Data.Map (Map, lookup)
-import Data.Maybe (Maybe(..), catMaybes)
+import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import Data.Text (splitOn, intercalate)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
-import Debug.Trace (trace)
+import GHC.Generics (Generic)
+import System.FilePath (FilePath)
+
import Gargantext.Core (Lang(..))
+import Gargantext.Core.Ext.IMT (toSchoolName)
+import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Core.Flow.Types
+import Gargantext.Core.Text
+import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..))
+import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
+import Gargantext.Core.Text.List (buildNgramsLists)
+import Gargantext.Core.Text.Terms
+import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (Terms(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
+import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
-import Gargantext.Database.Action.Query.Node
-import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..), ContactWho(..))
-import Gargantext.Database.Action.Query.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
-import Gargantext.Database.Action.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
-import Gargantext.Database.Action.Search (searchInDatabase)
+import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
-import Gargantext.Database.Admin.Types.Errors (HasNodeError(..))
+import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
-import Gargantext.Database.Admin.Utils (Cmd)
-import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
-import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
-import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
-import Gargantext.Ext.IMT (toSchoolName)
-import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
-import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
+import Gargantext.Database.Prelude
+import Gargantext.Database.Query.Table.Ngrams
+import Gargantext.Database.Query.Table.Node
+import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
+import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
+import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
+import Gargantext.Database.Query.Table.NodeNodeNgrams2
+import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
+import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude
-import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
-import Gargantext.Text.List (buildNgramsLists,StopSize(..))
-import qualified Gargantext.Text.Terms as GTT (TermType(..), tt_lang, extractTerms, uniText)
-import Gargantext.Text.Terms.Eleve (buildTries, toToken)
-import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
-import GHC.Generics (Generic)
-import Prelude (String)
-import System.FilePath (FilePath)
-import qualified Data.List as List
-import qualified Data.Map as Map
-import qualified Data.Text as Text
-import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
-import qualified Gargantext.Text.Corpus.API as API
+import Gargantext.Prelude.Crypto.Hash (Hash)
+import qualified Gargantext.Core.Text.Corpus.API as API
+import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
-data DataOrigin = Internal { _do_api :: API.ExternalAPIs }
- | External { _do_api :: API.ExternalAPIs }
+data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
+ | ExternalOrigin { _do_api :: API.ExternalAPIs }
-- TODO Web
deriving (Generic, Eq)
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
allDataOrigins :: [DataOrigin]
-allDataOrigins = map Internal API.externalAPIs <> map External API.externalAPIs
+allDataOrigins = map InternalOrigin API.externalAPIs
+ <> map ExternalOrigin API.externalAPIs
---------------
-
data DataText = DataOld ![NodeId]
| DataNew ![[HyperdataDocument]]
-
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
=> DataOrigin
-> API.Query
-> Maybe API.Limit
-> m DataText
-getDataText (External api) la q li = liftBase $ DataNew
+getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
<$> splitEvery 500
<$> API.get api (_tt_lang la) q li
-getDataText (Internal _) _la q _li = do
+getDataText (InternalOrigin _) _la q _li = do
(_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
(UserName userMaster)
(Left "")
(Nothing :: Maybe HyperdataCorpus)
- ids <- map fst <$> searchInDatabase cId (stemIt q)
+ ids <- map fst <$> searchDocInDatabase cId (stemIt q)
pure $ DataOld ids
-------------------------------------------------------------------------------
--- API for termType
-data TermType lang
- = Mono { _tt_lang :: lang }
- | Multi { _tt_lang :: lang }
- | MonoMulti { _tt_lang :: lang }
- | Unsupervised { _tt_lang :: lang
- , _tt_windowSize :: Int
- , _tt_ngramsSize :: Int
- }
- deriving Generic
-
--- | GTT.TermType as a complex type in Unsupervised configuration that is not needed
--- for the API use
-tta2tt :: TermType lang -> GTT.TermType lang
-tta2tt (Mono l) = GTT.Mono l
-tta2tt (Multi l) = GTT.Multi l
-tta2tt (MonoMulti l) = GTT.MonoMulti l
-tta2tt (Unsupervised la w ng) = GTT.Unsupervised la w ng Nothing
-
-makeLenses ''TermType
-deriveJSON (unPrefix "_tt_") ''TermType
-
-instance (ToSchema a) => ToSchema (TermType a) where
- declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_tt_")
-
-
-flowDataText :: FlowCmdM env err m
- => User
- -> DataText
- -> TermType Lang
- -> CorpusId
- -> m CorpusId
+flowDataText :: ( FlowCmdM env err m
+ )
+ => User
+ -> DataText
+ -> TermType Lang
+ -> CorpusId
+ -> m CorpusId
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
where
corpusType = (Nothing :: Maybe HyperdataCorpus)
------------------------------------------------------------------------
-- TODO use proxy
-flowAnnuaire :: FlowCmdM env err m
+flowAnnuaire :: (FlowCmdM env err m)
=> User
-> Either CorpusName [CorpusId]
-> (TermType Lang)
flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
------------------------------------------------------------------------
-flowCorpusFile :: FlowCmdM env err m
+flowCorpusFile :: (FlowCmdM env err m)
=> User
-> Either CorpusName [CorpusId]
-> Limit -- Limit the number of docs (for dev purpose)
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
- => Maybe c
- -> User
- -> Either CorpusName [CorpusId]
- -> TermType Lang
- -> [[a]]
- -> m CorpusId
+flow :: ( FlowCmdM env err m
+ , FlowCorpus a
+ , MkCorpus c
+ )
+ => Maybe c
+ -> User
+ -> Either CorpusName [CorpusId]
+ -> TermType Lang
+ -> [[a]]
+ -> m CorpusId
flow c u cn la docs = do
- let la' = tta2tt la
- ids <- traverse (insertMasterDocs c la') docs
- flowCorpusUser (la' ^. GTT.tt_lang) u cn c (concat ids)
+ -- TODO if public insertMasterDocs else insertUserDocs
+ ids <- traverse (insertMasterDocs c la) docs
+ flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
------------------------------------------------------------------------
-flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
+flowCorpusUser :: ( FlowCmdM env err m
+ , MkCorpus c
+ )
=> Lang
-> User
-> Either CorpusName [CorpusId]
flowCorpusUser l user corpusName ctype ids = do
-- User Flow
(userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
+ -- NodeTexts is first
+ _tId <- insertDefaultNode NodeTexts userCorpusId userId
+ -- printDebug "NodeTexts: " tId
+
+ -- NodeList is second
listId <- getOrMkList userCorpusId userId
- _cooc <- mkNode NodeListCooc listId userId
+ -- _cooc <- insertDefaultNode NodeListCooc listId userId
-- TODO: check if present already, ignore
_ <- Doc.add userCorpusId ids
- _tId <- mkNode NodeTexts userCorpusId userId
- -- printDebug "Node Text Id" tId
+ -- printDebug "Node Text Ids:" tId
-- User List Flow
(masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
- ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
+ ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
_userListId <- flowList_DbRepo listId ngs
_mastListId <- getOrMkList masterCorpusId masterUserId
-- _ <- insertOccsUpdates userCorpusId mastListId
-- printDebug "userListId" userListId
-- User Graph Flow
- _ <- mkDashboard userCorpusId userId
- _ <- mkGraph userCorpusId userId
+ _ <- insertDefaultNode NodeDashboard userCorpusId userId
+ _ <- insertDefaultNode NodeGraph userCorpusId userId
--_ <- mkPhylo userCorpusId userId
-
-- Annuaire Flow
-- _ <- mkAnnuaire rootUserId userId
pure userCorpusId
, MkCorpus c
)
=> Maybe c
- -> GTT.TermType Lang
+ -> TermType Lang
-> [a]
-> m [DocId]
insertMasterDocs c lang hs = do
(masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
-
- -- TODO Type NodeDocumentUnicised
- let docs = map addUniqId hs
- ids <- insertDb masterUserId masterCorpusId docs
- let
- ids' = map reId ids
- documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
+ (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
+ _ <- Doc.add masterCorpusId ids'
-- TODO
-- create a corpus with database name (CSV or PubMed)
-- add documents to the corpus (create node_node link)
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
- maps <- mapNodeIdNgrams
+ mapNgramsDocs <- mapNodeIdNgrams
<$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
- terms2id <- insertNgrams $ Map.keys maps
+ terms2id <- insertNgrams $ Map.keys mapNgramsDocs
-- to be removed
- let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
+ let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
-- new
lId <- getOrMkList masterCorpusId masterUserId
mapCgramsId <- listInsertDb lId toNodeNgramsW'
$ map (first _ngramsTerms . second Map.keys)
- $ Map.toList maps
+ $ Map.toList mapNgramsDocs
-- insertDocNgrams
_return <- insertNodeNodeNgrams2
$ catMaybes [ NodeNodeNgrams2 <$> Just nId
- <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
+ <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double)
- | (terms, mapNgramsTypes) <- Map.toList maps
+ | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, w) <- Map.toList mapNodeIdWeight
]
- _ <- Doc.add masterCorpusId ids'
- _cooc <- mkNode NodeListCooc lId masterUserId
+ -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- to be removed
_ <- insertDocNgrams lId indexedNgrams
-
pure ids'
+------------------------------------------------------------------------
+-- TODO Type NodeDocumentUnicised
+insertDocs :: ( FlowCmdM env err m
+ -- , FlowCorpus a
+ , FlowInsertDB a
+ )
+ => UserId
+ -> CorpusId
+ -> [a]
+ -> m ([DocId], [DocumentWithId a])
+insertDocs uId cId hs = do
+ let docs = map addUniqId hs
+ newIds <- insertDb uId cId docs
+ -- printDebug "newIds" newIds
+ let
+ newIds' = map reId newIds
+ documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
+ _ <- Doc.add cId newIds'
+ pure (newIds', documentsWithId)
-withLang :: HasText a
- => GTT.TermType Lang
- -> [DocumentWithId a]
- -> GTT.TermType Lang
-withLang (GTT.Unsupervised l n s m) ns = GTT.Unsupervised l n s m'
- where
- m' = case m of
- Nothing -> trace ("buildTries here" :: String)
- $ Just
- $ buildTries n ( fmap toToken $ GTT.uniText
- $ Text.intercalate " . "
- $ List.concat
- $ map hasText ns
- )
- just_m -> just_m
-withLang l _ = l
------------------------------------------------------------------------
viewUniqId' :: UniqId a
=> a
- -> (HashId, a)
+ -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
where
err = panic "[ERROR] Database.Flow.toInsert"
toInserted :: [ReturnId]
- -> Map HashId ReturnId
+ -> Map Hash ReturnId
toInserted =
- Map.fromList . map (\r -> (reUniqId r, r) )
+ Map.fromList . map (\r -> (reUniqId r, r) )
. filter (\r -> reInserted r == True)
-mergeData :: Map HashId ReturnId
- -> Map HashId a
+mergeData :: Map Hash ReturnId
+ -> Map Hash a
-> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
where
<*> Just hpd
------------------------------------------------------------------------
-
instance HasText HyperdataContact
where
hasText = undefined
+------------------------------------------------------------------------
+------------------------------------------------------------------------
+documentIdWithNgrams :: HasNodeError err
+ => (a
+ -> Cmd err (Map Ngrams (Map NgramsType Int)))
+ -> [DocumentWithId a]
+ -> Cmd err [DocumentIdWithNgrams a]
+documentIdWithNgrams f = traverse toDocumentIdWithNgrams
+ where
+ toDocumentIdWithNgrams d = do
+ e <- f $ documentData d
+ pure $ DocumentIdWithNgrams d e
+------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
where
extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
where
- extract :: GTT.TermType Lang -> HyperdataContact
+ extract :: TermType Lang -> HyperdataContact
-> Cmd err (Map Ngrams (Map NgramsType Int))
extract _l hc' = do
let authors = map text2ngrams
instance HasText HyperdataDocument
where
- hasText h = catMaybes [ _hyperdataDocument_title h
- , _hyperdataDocument_abstract h
+ hasText h = catMaybes [ _hd_title h
+ , _hd_abstract h
]
+
instance ExtractNgramsT HyperdataDocument
where
- extractNgramsT :: GTT.TermType Lang
+ extractNgramsT :: TermType Lang
-> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
where
- extractNgramsT' :: GTT.TermType Lang
+ extractNgramsT' :: TermType Lang
-> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT' lang' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
- $ _hyperdataDocument_source doc
+ $ _hd_source doc
institutes = map text2ngrams
$ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
- $ _hyperdataDocument_institutes doc
+ $ _hd_institutes doc
authors = map text2ngrams
$ maybe ["Nothing"] (splitOn ", ")
- $ _hyperdataDocument_authors doc
+ $ _hd_authors doc
terms' <- map text2ngrams
<$> map (intercalate " " . _terms_label)
<$> concat
- <$> liftBase (GTT.extractTerms lang' $ hasText doc)
+ <$> liftBase (extractTerms lang' $ hasText doc)
pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
<> [(i', Map.singleton Institutes 1) | i' <- institutes ]
<> [(a', Map.singleton Authors 1) | a' <- authors ]
<> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
- -> Map Ngrams (Map NgramsType Int)
-filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
+instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
where
- filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
- True -> (ng,y)
- False -> (Ngrams (Text.take s' t) n , y)
-
+ extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-documentIdWithNgrams :: HasNodeError err
- => (a
- -> Cmd err (Map Ngrams (Map NgramsType Int)))
- -> [DocumentWithId a]
- -> Cmd err [DocumentIdWithNgrams a]
-documentIdWithNgrams f = traverse toDocumentIdWithNgrams
+instance HasText a => HasText (Node a)
where
- toDocumentIdWithNgrams d = do
- e <- f $ documentData d
- pure $ DocumentIdWithNgrams d e
+ hasText (Node _ _ _ _ _ _ _ h) = hasText h
+