-- TODO-EVENTS: InsertedNodes
-}
-{-# OPTIONS_GHC -fno-warn-orphans #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
-{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE InstanceSigs #-}
+{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
- ( FlowCmdM
+ ( DataText(..)
, getDataText
+ , getDataText_Debug
, flowDataText
, flow
, flowCorpusFile
, flowCorpus
+ , flowCorpusUser
, flowAnnuaire
+ , insertMasterDocs
+ , saveDocNgramsWith
, getOrMkRoot
, getOrMk_RootWithCorpus
, allDataOrigins
, do_api
+ , indexAllDocumentsWithPosTag
)
where
-import Control.Lens ((^.), view, _Just, makeLenses)
+import Conduit
+import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
+import Control.Monad.Reader (MonadReader)
import Data.Aeson.TH (deriveJSON)
+import Data.Conduit.Internal (zipSources)
+import qualified Data.Conduit.List as CList
import Data.Either
+import Data.HashMap.Strict (HashMap)
+import Data.Hashable (Hashable)
import Data.List (concat)
-import qualified Data.Map as Map
import Data.Map (Map, lookup)
-import Data.Maybe (Maybe(..), catMaybes)
+import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
-import Data.Text (splitOn, intercalate)
-import Data.Traversable (traverse)
+import qualified Data.Text as T
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
+import Servant.Client (ClientError)
import System.FilePath (FilePath)
+import qualified Data.HashMap.Strict as HashMap
+import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
+import qualified Data.Map as Map
+import qualified Data.Conduit.List as CL
+import qualified Data.Conduit as C
-import Gargantext.Core (Lang(..))
+import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
+import Gargantext.Core (Lang(..), PosTagAlgo(..))
+import Gargantext.Core.Ext.IMT (toSchoolName)
+import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
-import Gargantext.Core.Types (Terms(..))
+import Gargantext.Core.Text
+import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
+import Gargantext.Core.Text.List (buildNgramsLists)
+import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
+import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
+import Gargantext.Core.Text.Terms
+import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
+import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
+import Gargantext.Core.Utils (addTuples)
+import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
-import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
-import Gargantext.Database.Query.Table.Node
-import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
-import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
+import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
-import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
+import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
+import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
+import Gargantext.Database.Query.Table.Node
+import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
+import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
-import Gargantext.Database.Query.Table.NodeNodeNgrams2
-import Gargantext.Core.Ext.IMT (toSchoolName)
-import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
-import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
-import Gargantext.Core.Text
+import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
+import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
+import Gargantext.Database.Types
import Gargantext.Prelude
-import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
-import Gargantext.Core.Text.List (buildNgramsLists,StopSize(..))
-import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
-import Gargantext.Core.Text.Terms
-import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
+import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
+import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
+--import qualified Prelude
+------------------------------------------------------------------------
+-- Imports for upgrade function
+import Gargantext.Database.Query.Tree.Root (getRootId)
+import Gargantext.Database.Query.Tree (findNodesId)
+import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
instance ToSchema DataOrigin where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-allDataOrigins :: [DataOrigin]
-allDataOrigins = map InternalOrigin API.externalAPIs
- <> map ExternalOrigin API.externalAPIs
+allDataOrigins :: ( MonadReader env m
+ , HasConfig env) => m [DataOrigin]
+allDataOrigins = do
+ ext <- API.externalAPIs
+
+ pure $ map InternalOrigin ext
+ <> map ExternalOrigin ext
---------------
data DataText = DataOld ![NodeId]
- | DataNew ![[HyperdataDocument]]
+ | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
+ --- | DataNew ![[HyperdataDocument]]
+
+-- Show instance is not possible because of IO
+printDataText :: DataText -> IO ()
+printDataText (DataOld xs) = putStrLn $ show xs
+printDataText (DataNew (maybeInt, conduitData)) = do
+ res <- C.runConduit (conduitData .| CL.consume)
+ putStrLn $ show (maybeInt, res)
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
-> TermType Lang
-> API.Query
-> Maybe API.Limit
- -> m DataText
-getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
- <$> splitEvery 500
- <$> API.get api (_tt_lang la) q li
+ -> m (Either ClientError DataText)
+getDataText (ExternalOrigin api) la q li = liftBase $ do
+ eRes <- API.get api (_tt_lang la) q li
+ pure $ DataNew <$> eRes
+
getDataText (InternalOrigin _) _la q _li = do
(_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
(UserName userMaster)
(Left "")
(Nothing :: Maybe HyperdataCorpus)
ids <- map fst <$> searchDocInDatabase cId (stemIt q)
- pure $ DataOld ids
+ pure $ Right $ DataOld ids
+
+getDataText_Debug :: FlowCmdM env err m
+ => DataOrigin
+ -> TermType Lang
+ -> API.Query
+ -> Maybe API.Limit
+ -> m ()
+getDataText_Debug a l q li = do
+ result <- getDataText a l q li
+ case result of
+ Left err -> liftBase $ putStrLn $ show err
+ Right res -> liftBase $ printDataText res
+
-------------------------------------------------------------------------------
-flowDataText :: FlowCmdM env err m
- => User
- -> DataText
- -> TermType Lang
- -> CorpusId
- -> m CorpusId
-flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
+flowDataText :: forall env err m.
+ ( FlowCmdM env err m
+ )
+ => User
+ -> DataText
+ -> TermType Lang
+ -> CorpusId
+ -> Maybe FlowSocialListWith
+ -> (JobLog -> m ())
+ -> m CorpusId
+flowDataText u (DataOld ids) tt cid mfslw _ = do
+ (_userId, userCorpusId, listId) <- createNodes u (Right [cid]) corpusType
+ _ <- Doc.add userCorpusId ids
+ flowCorpusUser (_tt_lang tt) u userCorpusId listId corpusType mfslw
where
corpusType = (Nothing :: Maybe HyperdataCorpus)
-flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
+flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
+ flowCorpus u (Right [cid]) tt mfslw (mLen, (transPipe liftBase txtC)) logStatus
------------------------------------------------------------------------
-- TODO use proxy
-flowAnnuaire :: FlowCmdM env err m
+flowAnnuaire :: (FlowCmdM env err m)
=> User
-> Either CorpusName [CorpusId]
-> (TermType Lang)
-> FilePath
+ -> (JobLog -> m ())
-> m AnnuaireId
-flowAnnuaire u n l filePath = do
- docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
- flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
+flowAnnuaire u n l filePath logStatus = do
+ -- TODO Conduit for file
+ docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
+ flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus
------------------------------------------------------------------------
-flowCorpusFile :: FlowCmdM env err m
+flowCorpusFile :: (FlowCmdM env err m)
=> User
-> Either CorpusName [CorpusId]
-> Limit -- Limit the number of docs (for dev purpose)
- -> TermType Lang -> FileFormat -> FilePath
+ -> TermType Lang
+ -> FileType
+ -> FileFormat
+ -> FilePath
+ -> Maybe FlowSocialListWith
+ -> (JobLog -> m ())
-> m CorpusId
-flowCorpusFile u n l la ff fp = do
- docs <- liftBase ( splitEvery 500
- <$> take l
- <$> parseFile ff fp
- )
- flowCorpus u n la (map (map toHyperdataDocument) docs)
+flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
+ eParsed <- liftBase $ parseFile ft ff fp
+ case eParsed of
+ Right parsed -> do
+ flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
+ --let docs = splitEvery 500 $ take l parsed
+ --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
+ Left e -> panic $ "Error: " <> T.pack e
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
=> User
-> Either CorpusName [CorpusId]
-> TermType Lang
- -> [[a]]
+ -> Maybe FlowSocialListWith
+ -> (Maybe Integer, ConduitT () a m ())
+ -> (JobLog -> m ())
-> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
- => Maybe c
- -> User
- -> Either CorpusName [CorpusId]
- -> TermType Lang
- -> [[a]]
- -> m CorpusId
-flow c u cn la docs = do
+flow :: forall env err m a c.
+ ( FlowCmdM env err m
+ , FlowCorpus a
+ , MkCorpus c
+ )
+ => Maybe c
+ -> User
+ -> Either CorpusName [CorpusId]
+ -> TermType Lang
+ -> Maybe FlowSocialListWith
+ -> (Maybe Integer, ConduitT () a m ())
+ -> (JobLog -> m ())
+ -> m CorpusId
+flow c u cn la mfslw (mLength, docsC) logStatus = do
+ (_userId, userCorpusId, listId) <- createNodes u cn c
-- TODO if public insertMasterDocs else insertUserDocs
- ids <- traverse (insertMasterDocs c la) docs
- flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
+ _ <- runConduit $ zipSources (yieldMany [1..]) docsC
+ .| CList.chunksOf 100
+ .| mapMC insertDocs'
+ .| mapM_C (\ids' -> do
+ _ <- Doc.add userCorpusId ids'
+ pure ())
+ .| sinkList
+
+ _ <- flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw
+
+-- ids <- traverse (\(idx, doc) -> do
+-- id <- insertMasterDocs c la doc
+-- logStatus JobLog { _scst_succeeded = Just $ 1 + idx
+-- , _scst_failed = Just 0
+-- , _scst_remaining = Just $ length docs - idx
+-- , _scst_events = Just []
+-- }
+-- pure id
+-- ) (zip [1..] docs)
+ --printDebug "[flow] calling flowCorpusUser" (0 :: Int)
+ pure userCorpusId
+ --flowCorpusUser (la ^. tt_lang) u cn c ids mfslw
+
+ where
+ insertDocs' :: [(Integer, a)] -> m [NodeId]
+ insertDocs' [] = pure []
+ insertDocs' docs = do
+ printDebug "[flow] calling insertDoc, ([idx], mLength) = " (fst <$> docs, mLength)
+ ids <- insertMasterDocs c la (snd <$> docs)
+ let maxIdx = maximum (fst <$> docs)
+ case mLength of
+ Nothing -> pure ()
+ Just len -> do
+ logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + maxIdx
+ , _scst_failed = Just 0
+ , _scst_remaining = Just $ fromIntegral $ len - maxIdx
+ , _scst_events = Just []
+ }
+ pure ids
+
+
------------------------------------------------------------------------
-flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
- => Lang
- -> User
- -> Either CorpusName [CorpusId]
- -> Maybe c
- -> [NodeId]
- -> m CorpusId
-flowCorpusUser l user corpusName ctype ids = do
+createNodes :: ( FlowCmdM env err m
+ , MkCorpus c
+ )
+ => User
+ -> Either CorpusName [CorpusId]
+ -> Maybe c
+ -> m (UserId, CorpusId, ListId)
+createNodes user corpusName ctype = do
-- User Flow
(userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
+ -- NodeTexts is first
+ _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
+ -- printDebug "NodeTexts: " tId
+
+ -- NodeList is second
listId <- getOrMkList userCorpusId userId
- _cooc <- insertDefaultNode NodeListCooc listId userId
- -- TODO: check if present already, ignore
- _ <- Doc.add userCorpusId ids
- -- tId <- insertDefaultNode NodeTexts userCorpusId userId
- -- printDebug "Node Text Id" tId
+ -- User Graph Flow
+ _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
+ _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
+
+ pure (userId, userCorpusId, listId)
+
+flowCorpusUser :: ( FlowCmdM env err m
+ , MkCorpus c
+ )
+ => Lang
+ -> User
+ -> CorpusId
+ -> ListId
+ -> Maybe c
+ -> Maybe FlowSocialListWith
+ -> m CorpusId
+flowCorpusUser l user userCorpusId listId ctype mfslw = do
-- User List Flow
- (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
- ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
- _userListId <- flowList_DbRepo listId ngs
- _mastListId <- getOrMkList masterCorpusId masterUserId
+ (masterUserId, _masterRootId, masterCorpusId)
+ <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
+
+ --let gp = (GroupParams l 2 3 (StopSize 3))
+ -- Here the PosTagAlgo should be chosen according to the Lang
+ _ <- case mfslw of
+ (Just (NoList _)) -> do
+ printDebug "Do not build list" mfslw
+ pure ()
+ _ -> do
+ ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw
+ $ GroupWithPosTag l CoreNLP HashMap.empty
+
+ -- printDebug "flowCorpusUser:ngs" ngs
+
+ _userListId <- flowList_DbRepo listId ngs
+ _mastListId <- getOrMkList masterCorpusId masterUserId
+ pure ()
-- _ <- insertOccsUpdates userCorpusId mastListId
-- printDebug "userListId" userListId
- -- User Graph Flow
- _ <- insertDefaultNode NodeDashboard userCorpusId userId
- _ <- insertDefaultNode NodeGraph userCorpusId userId
--_ <- mkPhylo userCorpusId userId
-- Annuaire Flow
-- _ <- mkAnnuaire rootUserId userId
- pure userCorpusId
+ _ <- updateNgramsOccurrences userCorpusId (Just listId)
--- TODO Type NodeDocumentUnicised
-insertDocs :: ( FlowCmdM env err m
- , FlowCorpus a
- )
- => [a]
- -> UserId
- -> CorpusId
- -> m ([DocId], [DocumentWithId a])
-insertDocs hs uId cId = do
- printDebug "hs" (length hs)
- let docs = map addUniqId hs
- printDebug "docs" (length docs)
- ids <- insertDb uId cId docs
- printDebug "ids" (length ids)
- let
- ids' = map reId ids
- documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
- _ <- Doc.add cId ids'
- pure (ids', documentsWithId)
+ pure userCorpusId
insertMasterDocs :: ( FlowCmdM env err m
-> m [DocId]
insertMasterDocs c lang hs = do
(masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
- (ids', documentsWithId) <- insertDocs hs masterUserId masterCorpusId
-
+ (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
+ _ <- Doc.add masterCorpusId ids'
-- TODO
-- create a corpus with database name (CSV or PubMed)
-- add documents to the corpus (create node_node link)
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
- mapNgramsDocs <- mapNodeIdNgrams
- <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
+ mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
+ <- mapNodeIdNgrams
+ <$> documentIdWithNgrams
+ (extractNgramsT $ withLang lang documentsWithId)
+ documentsWithId
- terms2id <- insertNgrams $ Map.keys mapNgramsDocs
- -- to be removed
- let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
+ lId <- getOrMkList masterCorpusId masterUserId
+ -- _ <- saveDocNgramsWith lId mapNgramsDocs'
+ _ <- saveDocNgramsWith lId mapNgramsDocs'
+
+ -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
+ pure ids'
+
+saveDocNgramsWith :: (FlowCmdM env err m)
+ => ListId
+ -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
+ -> m ()
+saveDocNgramsWith lId mapNgramsDocs' = do
+ --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
+ let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
+ terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
+
+ let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- new
- lId <- getOrMkList masterCorpusId masterUserId
mapCgramsId <- listInsertDb lId toNodeNgramsW'
- $ map (first _ngramsTerms . second Map.keys)
- $ Map.toList mapNgramsDocs
+ $ map (first _ngramsTerms . second Map.keys)
+ $ HashMap.toList mapNgramsDocs
+
+ --printDebug "saveDocNgramsWith" mapCgramsId
-- insertDocNgrams
- _return <- insertNodeNodeNgrams2
- $ catMaybes [ NodeNodeNgrams2 <$> Just nId
- <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
- <*> Just (fromIntegral w :: Double)
- | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
+ _return <- insertContextNodeNgrams2
+ $ catMaybes [ ContextNodeNgrams2 <$> Just nId
+ <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
+ <*> Just (fromIntegral w :: Double)
+ | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
- , (nId, w) <- Map.toList mapNodeIdWeight
+ , (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
]
- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- to be removed
- _ <- insertDocNgrams lId indexedNgrams
- pure ids'
+ _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
+
+ pure ()
+
------------------------------------------------------------------------
+-- TODO Type NodeDocumentUnicised
+insertDocs :: ( FlowCmdM env err m
+ -- , FlowCorpus a
+ , FlowInsertDB a
+ )
+ => UserId
+ -> CorpusId
+ -> [a]
+ -> m ([ContextId], [Indexed ContextId a])
+insertDocs uId cId hs = do
+ let docs = map addUniqId hs
+ newIds <- insertDb uId Nothing docs
+ -- printDebug "newIds" newIds
+ let
+ newIds' = map reId newIds
+ documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
+ _ <- Doc.add cId newIds'
+ pure (newIds', documentsWithId)
+
+
------------------------------------------------------------------------
viewUniqId' :: UniqId a
=> a
- -> (HashId, a)
+ -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
where
err = panic "[ERROR] Database.Flow.toInsert"
toInserted :: [ReturnId]
- -> Map HashId ReturnId
+ -> Map Hash ReturnId
toInserted =
- Map.fromList . map (\r -> (reUniqId r, r) )
+ Map.fromList . map (\r -> (reUniqId r, r) )
. filter (\r -> reInserted r == True)
-mergeData :: Map HashId ReturnId
- -> Map HashId a
- -> [DocumentWithId a]
+mergeData :: Map Hash ReturnId
+ -> Map Hash a
+ -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
where
toDocumentWithId (sha,hpd) =
- DocumentWithId <$> fmap reId (lookup sha rs)
- <*> Just hpd
+ Indexed <$> fmap reId (lookup sha rs)
+ <*> Just hpd
------------------------------------------------------------------------
-instance HasText HyperdataContact
- where
- hasText = undefined
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
=> (a
- -> Cmd err (Map Ngrams (Map NgramsType Int)))
- -> [DocumentWithId a]
- -> Cmd err [DocumentIdWithNgrams a]
+ -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
+ -> [Indexed NodeId a]
+ -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
- e <- f $ documentData d
- pure $ DocumentIdWithNgrams d e
+ e <- f $ _unIndex d
+ pure $ DocumentIdWithNgrams d e
+
+
+-- | TODO check optimization
+mapNodeIdNgrams :: (Ord b, Hashable b)
+ => [DocumentIdWithNgrams a b]
+ -> HashMap b
+ (Map NgramsType
+ (Map NodeId (Int, TermsCount))
+ )
+mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
+ where
+ -- | NOTE We are somehow multiplying 'TermsCount' here: If the
+ -- same ngrams term has different ngrams types, the 'TermsCount'
+ -- for it (which is the number of times the terms appears in a
+ -- document) is copied over to all its types.
+ f :: DocumentIdWithNgrams a b
+ -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
+ f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
+ where
+ nId = _index $ documentWithId d
+
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
where
- extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
+ extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where
extract :: TermType Lang -> HyperdataContact
- -> Cmd err (Map Ngrams (Map NgramsType Int))
+ -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extract _l hc' = do
let authors = map text2ngrams
- $ maybe ["Nothing"] (\a -> [a])
- $ view (hc_who . _Just . cw_lastName) hc'
+ $ maybe ["Nothing"] (\a -> [a])
+ $ view (hc_who . _Just . cw_lastName) hc'
- pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
+ pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
-instance HasText HyperdataDocument
- where
- hasText h = catMaybes [ _hd_title h
- , _hd_abstract h
- ]
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT :: TermType Lang
-> HyperdataDocument
- -> Cmd err (Map Ngrams (Map NgramsType Int))
- extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
+ -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
+ extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
where
extractNgramsT' :: TermType Lang
-> HyperdataDocument
- -> Cmd err (Map Ngrams (Map NgramsType Int))
+ -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT' lang' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
$ _hd_source doc
institutes = map text2ngrams
- $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
+ $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
$ _hd_institutes doc
authors = map text2ngrams
- $ maybe ["Nothing"] (splitOn ", ")
+ $ maybe ["Nothing"] (T.splitOn ", ")
$ _hd_authors doc
- terms' <- map text2ngrams
- <$> map (intercalate " " . _terms_label)
- <$> concat
- <$> liftBase (extractTerms lang' $ hasText doc)
+ termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
+ <$> concat
+ <$> liftBase (extractTerms lang' $ hasText doc)
- pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
- <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
- <> [(a', Map.singleton Authors 1) | a' <- authors ]
- <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
+ pure $ HashMap.fromList
+ $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
+ <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes ]
+ <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
+ <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
+instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
+ where
+ extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
+instance HasText a => HasText (Node a)
+ where
+ hasText (Node { _node_hyperdata = h }) = hasText h
+
+
+
+-- | TODO putelsewhere
+-- | Upgrade function
+-- Suppose all documents are English (this is the case actually)
+indexAllDocumentsWithPosTag :: FlowCmdM env err m
+ => m ()
+indexAllDocumentsWithPosTag = do
+ rootId <- getRootId (UserName userMaster)
+ corpusIds <- findNodesId rootId [NodeCorpus]
+ docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
+ _ <- mapM extractInsert (splitEvery 1000 docs)
+ pure ()
+
+extractInsert :: FlowCmdM env err m
+ => [Node HyperdataDocument] -> m ()
+extractInsert docs = do
+ let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
+ mapNgramsDocs' <- mapNodeIdNgrams
+ <$> documentIdWithNgrams
+ (extractNgramsT $ withLang (Multi EN) documentsWithId)
+ documentsWithId
+ _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
+ pure ()