2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
19 --import Control.Lens (view)
20 --import Gargantext.Core.Types
21 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
22 import Data.Map (Map, lookup)
23 import Data.Maybe (Maybe(..), catMaybes)
24 import Data.Text (Text, splitOn, intercalate)
25 import Data.Tuple.Extra (both, second)
26 import Data.List (concat)
27 import GHC.Show (Show)
28 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
29 import Gargantext.Core.Types.Individu (Username)
30 import Gargantext.Core.Types.Main
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Database.Bashql (runCmd') -- , del)
33 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
34 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
35 import Gargantext.Text.Terms (extractTerms)
36 import Gargantext.Database.Node.Document.Add (add)
37 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
38 import Gargantext.Database.Root (getRootCmd)
39 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
40 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId')
41 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
42 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
43 import Gargantext.Database.Schema.User (getUser, UserLight(..))
44 import Gargantext.Database.Types.Node (HyperdataDocument(..))
45 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
46 import Gargantext.Database.Utils (Cmd(..))
47 import Gargantext.Text.Terms (TermType(..))
48 import Gargantext.Ext.IMT (toSchoolName)
49 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
50 import Gargantext.Prelude
51 import Gargantext.Text.Parsers (parseDocs, FileFormat)
52 import System.FilePath (FilePath)
53 import qualified Data.Map as DM
55 flowCorpus :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
56 flowCorpus ff fp cName = do
57 hyperdataDocuments' <- map addUniqIdsDoc <$> parseDocs ff fp
58 params <- flowInsert NodeCorpus hyperdataDocuments' cName
59 flowCorpus' NodeCorpus hyperdataDocuments' params
62 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
63 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
64 flowInsert _nt hyperdataDocuments cName = do
65 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
67 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
68 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
70 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
71 _ <- runCmd' $ add userCorpusId (map reId ids)
73 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
76 flowAnnuaire :: FilePath -> IO ()
77 flowAnnuaire filePath = do
78 contacts <- deserialiseImtUsersFromFile filePath
79 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
80 printDebug "length annuaire" ps
83 flowInsertAnnuaire :: CorpusName -> [ToDbData]
84 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
85 flowInsertAnnuaire name children = do
87 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeContact children
90 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
91 _ <- runCmd' $ add userCorpusId (map reId ids)
93 printDebug "AnnuaireID" userCorpusId
95 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
98 flowCorpus' :: NodeType -> [HyperdataDocument]
99 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
101 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
102 --------------------------------------------------
104 userListId <- runCmd' $ flowListUser userId userCorpusId
105 printDebug "Working on User ListId : " userListId
107 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
108 -- printDebug "documentsWithId" documentsWithId
109 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
110 -- printDebug "docsWithNgrams" docsWithNgrams
111 let maps = mapNodeIdNgrams docsWithNgrams
113 -- printDebug "maps" (maps)
114 indexedNgrams <- runCmd' $ indexNgrams maps
115 -- printDebug "inserted ngrams" indexedNgrams
116 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
118 listId2 <- runCmd' $ flowList masterUserId masterCorpusId indexedNgrams
119 printDebug "Working on ListId : " listId2
121 --------------------------------------------------
122 _ <- runCmd' $ mkDashboard userCorpusId userId
123 _ <- runCmd' $ mkGraph userCorpusId userId
126 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
129 -- runCmd' $ del [corpusId2, corpusId]
131 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
132 flowCorpus' _ _ _ = undefined
135 type CorpusName = Text
137 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
138 subFlowCorpus username cName = do
139 maybeUserId <- runCmd' (getUser username)
141 let userId = case maybeUserId of
142 Nothing -> panic "Error: User does not exist (yet)"
143 -- mk NodeUser gargantua_id "Node Gargantua"
144 Just user -> userLight_id user
146 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
148 rootId'' <- case rootId' of
149 [] -> runCmd' (mkRoot username userId)
150 n -> case length n >= 2 of
151 True -> panic "Error: more than 1 userNode / user"
152 False -> pure rootId'
153 let rootId = maybe (panic "error rootId") identity (head rootId'')
155 corpusId'' <- if username == userMaster
157 ns <- runCmd' $ getCorporaWithParentId' rootId
158 pure $ map _node_id ns
162 corpusId' <- if corpusId'' /= []
164 else runCmd' $ mkCorpus (Just cName) Nothing rootId userId
166 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
168 printDebug "(username, userId, rootId, corpusId)"
169 (username, userId, rootId, corpusId)
170 pure (userId, rootId, corpusId)
173 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
174 subFlowAnnuaire username _cName = do
175 maybeUserId <- runCmd' (getUser username)
177 let userId = case maybeUserId of
178 Nothing -> panic "Error: User does not exist (yet)"
179 -- mk NodeUser gargantua_id "Node Gargantua"
180 Just user -> userLight_id user
182 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
184 rootId'' <- case rootId' of
185 [] -> runCmd' (mkRoot username userId)
186 n -> case length n >= 2 of
187 True -> panic "Error: more than 1 userNode / user"
188 False -> pure rootId'
189 let rootId = maybe (panic "error rootId") identity (head rootId'')
191 corpusId' <- runCmd' $ mkAnnuaire rootId userId
193 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
195 printDebug "(username, userId, rootId, corpusId)"
196 (username, userId, rootId, corpusId)
197 pure (userId, rootId, corpusId)
201 ------------------------------------------------------------------------
202 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
203 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
205 err = "Database.Flow.toInsert"
207 toInserted :: [ReturnId] -> Map HashId ReturnId
208 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
209 . filter (\r -> reInserted r == True)
211 data DocumentWithId =
212 DocumentWithId { documentId :: NodeId
213 , documentData :: HyperdataDocument
216 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
217 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
219 toDocumentWithId (hash,hpd) =
220 DocumentWithId <$> fmap reId (lookup hash rs)
223 ------------------------------------------------------------------------
225 data DocumentIdWithNgrams =
227 { documentWithId :: DocumentWithId
228 , document_ngrams :: Map (NgramsT Ngrams) Int
231 -- TODO add Terms (Title + Abstract)
232 -- add f :: Text -> Text
233 -- newtype Ngrams = Ngrams Text
235 extractNgramsT :: HyperdataDocument -> IO (Map (NgramsT Ngrams) Int)
236 extractNgramsT doc = do
238 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
239 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
240 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
241 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
242 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> extractTerms (Multi EN) leText
244 pure $ DM.fromList $ [(NgramsT Sources source, 1)]
245 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
246 <> [(NgramsT Authors a' , 1)| a' <- authors ]
247 <> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
252 documentIdWithNgrams :: (HyperdataDocument -> IO (Map (NgramsT Ngrams) Int))
253 -> [DocumentWithId] -> IO [DocumentIdWithNgrams]
254 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
256 toDocumentIdWithNgrams d = do
257 e <- f $ documentData d
258 pure $ DocumentIdWithNgrams d e
260 -- | TODO check optimization
261 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
262 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
264 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
265 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
267 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
268 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
269 indexNgrams ng2nId = do
270 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
271 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
274 ------------------------------------------------------------------------
275 ------------------------------------------------------------------------
276 flowList :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
277 flowList uId cId ngs = do
278 -- printDebug "ngs:" ngs
279 lId <- getOrMkList cId uId
280 --printDebug "ngs" (DM.keys ngs)
281 -- TODO add stemming equivalence of 2 ngrams
282 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
283 _ <- insertGroups lId groupEd
285 -- compute Candidate / Map
286 let lists = ngrams2list ngs
287 -- printDebug "lists:" lists
289 is <- insertLists lId lists
290 printDebug "listNgrams inserted :" is
294 flowListUser :: UserId -> CorpusId -> Cmd Int
295 flowListUser uId cId = getOrMkList cId uId
297 ------------------------------------------------------------------------
299 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
300 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
301 -> Map NgramsIndexed NgramsIndexed
302 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
306 -- TODO check: do not insert duplicates
307 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
308 insertGroups lId ngrs =
309 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
310 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
314 ------------------------------------------------------------------------
315 -- TODO: verify NgramsT lost here
316 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
317 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
319 -- | TODO: weight of the list could be a probability
320 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
321 insertLists lId lngs =
322 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
323 | (l,ngr) <- map (second _ngramsId) lngs
326 ------------------------------------------------------------------------
327 ------------------------------------------------------------------------