2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both, second)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Text.Terms (extractTerms)
37 import Gargantext.Database.Node.Document.Add (add)
38 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
39 import Gargantext.Database.Root (getRoot)
40 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
41 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
42 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
44 import Gargantext.Database.Schema.User (getUser, UserLight(..))
45 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
46 import Gargantext.Database.Utils (Cmd)
47 import Gargantext.Text.Terms (TermType(..))
48 import Gargantext.Ext.IMT (toSchoolName)
49 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
50 import Gargantext.Prelude
51 import Gargantext.Text.Parsers (parseDocs, FileFormat)
52 import System.FilePath (FilePath)
53 import qualified Data.Map as DM
56 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
57 flowCorpus ff fp cName = do
58 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
59 params <- flowInsert NodeCorpus hyperdataDocuments' cName
60 flowCorpus' NodeCorpus hyperdataDocuments' params
63 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
64 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
65 flowInsert _nt hyperdataDocuments cName = do
66 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
68 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
69 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
71 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
72 _ <- add userCorpusId (map reId ids)
74 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
77 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
78 flowAnnuaire filePath = do
79 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
80 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
81 printDebug "length annuaire" ps
84 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
85 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
86 flowInsertAnnuaire name children = do
88 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
89 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
91 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
92 _ <- add userCorpusId (map reId ids)
94 printDebug "AnnuaireID" userCorpusId
96 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
99 flowCorpus' :: HasNodeError err
100 => NodeType -> [HyperdataDocument]
101 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
103 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
104 --------------------------------------------------
106 userListId <- flowListUser userId userCorpusId
107 printDebug "Working on User ListId : " userListId
109 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
110 -- printDebug "documentsWithId" documentsWithId
111 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
112 -- printDebug "docsWithNgrams" docsWithNgrams
113 let maps = mapNodeIdNgrams docsWithNgrams
115 -- printDebug "maps" (maps)
116 indexedNgrams <- indexNgrams maps
117 -- printDebug "inserted ngrams" indexedNgrams
118 _ <- insertToNodeNgrams indexedNgrams
120 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
121 printDebug "Working on ListId : " listId2
123 --------------------------------------------------
124 _ <- mkDashboard userCorpusId userId
125 _ <- mkGraph userCorpusId userId
128 -- _ <- mkAnnuaire rootUserId userId
131 -- del [corpusId2, corpusId]
133 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
134 flowCorpus' _ _ _ = undefined
137 type CorpusName = Text
139 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
140 subFlowCorpus username cName = do
141 maybeUserId <- getUser username
143 userId <- case maybeUserId of
144 Nothing -> nodeError NoUserFound
145 -- mk NodeUser gargantua_id "Node Gargantua"
146 Just user -> pure $ userLight_id user
148 rootId' <- map _node_id <$> getRoot username
150 rootId'' <- case rootId' of
151 [] -> mkRoot username userId
152 n -> case length n >= 2 of
153 True -> nodeError ManyNodeUsers
154 False -> pure rootId'
155 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
157 corpusId'' <- if username == userMaster
159 ns <- getCorporaWithParentId rootId
160 pure $ map _node_id ns
164 corpusId' <- if corpusId'' /= []
166 else mkCorpus (Just cName) Nothing rootId userId
168 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
170 printDebug "(username, userId, rootId, corpusId)"
171 (username, userId, rootId, corpusId)
172 pure (userId, rootId, corpusId)
175 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
176 subFlowAnnuaire username _cName = do
177 maybeUserId <- getUser username
179 userId <- case maybeUserId of
180 Nothing -> nodeError NoUserFound
181 -- mk NodeUser gargantua_id "Node Gargantua"
182 Just user -> pure $ userLight_id user
184 rootId' <- map _node_id <$> getRoot username
186 rootId'' <- case rootId' of
187 [] -> mkRoot username userId
188 n -> case length n >= 2 of
189 True -> nodeError ManyNodeUsers
190 False -> pure rootId'
191 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
193 corpusId' <- mkAnnuaire rootId userId
195 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
197 printDebug "(username, userId, rootId, corpusId)"
198 (username, userId, rootId, corpusId)
199 pure (userId, rootId, corpusId)
201 ------------------------------------------------------------------------
202 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
203 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
205 err = "Database.Flow.toInsert"
207 toInserted :: [ReturnId] -> Map HashId ReturnId
208 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
209 . filter (\r -> reInserted r == True)
211 data DocumentWithId =
212 DocumentWithId { documentId :: !NodeId
213 , documentData :: !HyperdataDocument
216 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
217 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
219 toDocumentWithId (hash,hpd) =
220 DocumentWithId <$> fmap reId (lookup hash rs)
223 ------------------------------------------------------------------------
224 data DocumentIdWithNgrams =
226 { documentWithId :: !DocumentWithId
227 , document_ngrams :: !(Map (NgramsT Ngrams) Int)
231 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map (NgramsT Ngrams) Int)
232 extractNgramsT doc = do
234 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
235 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
236 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
237 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
238 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
240 pure $ DM.fromList $ [(NgramsT Sources source, 1)]
241 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
242 <> [(NgramsT Authors a' , 1)| a' <- authors ]
243 <> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
247 documentIdWithNgrams :: HasNodeError err => (HyperdataDocument -> Cmd err (Map (NgramsT Ngrams) Int))
248 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
249 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
251 toDocumentIdWithNgrams d = do
252 e <- f $ documentData d
253 pure $ DocumentIdWithNgrams d e
255 -- | TODO check optimization
256 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
257 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
259 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
260 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
262 indexNgrams :: HasNodeError err => Map (NgramsT Ngrams ) (Map NodeId Int)
263 -> Cmd err (Map (NgramsT NgramsIndexed) (Map NodeId Int))
264 indexNgrams ng2nId = do
265 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
266 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
268 ------------------------------------------------------------------------
269 flowList :: HasNodeError err => UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd err ListId
270 flowList uId cId ngs = do
271 -- printDebug "ngs:" ngs
272 lId <- getOrMkList cId uId
273 --printDebug "ngs" (DM.keys ngs)
274 -- TODO add stemming equivalence of 2 ngrams
275 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
276 _ <- insertGroups lId groupEd
278 -- compute Candidate / Map
279 let lists = ngrams2list ngs
280 -- printDebug "lists:" lists
282 is <- insertLists lId lists
283 printDebug "listNgrams inserted :" is
287 flowListUser :: HasNodeError err => UserId -> CorpusId -> Cmd err NodeId
288 flowListUser uId cId = getOrMkList cId uId
290 ------------------------------------------------------------------------
292 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
293 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
294 -> Map NgramsIndexed NgramsIndexed
295 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
299 -- TODO check: do not insert duplicates
300 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
301 insertGroups lId ngrs =
302 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
303 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
307 ------------------------------------------------------------------------
308 -- TODO: verify NgramsT lost here
309 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
310 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
312 -- | TODO: weight of the list could be a probability
313 insertLists :: HasNodeError err => ListId -> [(ListType,NgramsIndexed)] -> Cmd err Int
314 insertLists lId lngs =
315 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
316 | (l,ngr) <- map (second _ngramsId) lngs
318 ------------------------------------------------------------------------