]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FLOW][Database] ngrams extraction and insertion.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
17 where
18
19 --import Control.Lens (view)
20 --import Gargantext.Core.Types
21 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
22 import Data.Map (Map, lookup)
23 import Data.Maybe (Maybe(..), catMaybes)
24 import Data.Text (Text, splitOn, intercalate)
25 import Data.Tuple.Extra (both, second)
26 import Data.List (concat)
27 import GHC.Show (Show)
28 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
29 import Gargantext.Core.Types.Individu (Username)
30 import Gargantext.Core.Types.Main
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Database.Bashql (runCmd') -- , del)
33 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
34 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
35 import Gargantext.Text.Terms (extractTerms)
36 import Gargantext.Database.Node.Document.Add (add)
37 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
38 import Gargantext.Database.Root (getRootCmd)
39 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
40 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId')
41 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
42 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
43 import Gargantext.Database.Schema.User (getUser, UserLight(..))
44 import Gargantext.Database.Types.Node (HyperdataDocument(..))
45 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
46 import Gargantext.Database.Utils (Cmd(..))
47 import Gargantext.Text.Terms (TermType(..))
48 import Gargantext.Ext.IMT (toSchoolName)
49 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
50 import Gargantext.Prelude
51 import Gargantext.Text.Parsers (parseDocs, FileFormat)
52 import System.FilePath (FilePath)
53 import qualified Data.Map as DM
54
55 flowCorpus :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
56 flowCorpus ff fp cName = do
57 hyperdataDocuments' <- map addUniqIdsDoc <$> parseDocs ff fp
58 params <- flowInsert NodeCorpus hyperdataDocuments' cName
59 flowCorpus' NodeCorpus hyperdataDocuments' params
60
61
62 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
63 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
64 flowInsert _nt hyperdataDocuments cName = do
65 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
66
67 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
68 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
69
70 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
71 _ <- runCmd' $ add userCorpusId (map reId ids)
72
73 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
74
75
76 flowAnnuaire :: FilePath -> IO ()
77 flowAnnuaire filePath = do
78 contacts <- deserialiseImtUsersFromFile filePath
79 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
80 printDebug "length annuaire" ps
81
82
83 flowInsertAnnuaire :: CorpusName -> [ToDbData]
84 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
85 flowInsertAnnuaire name children = do
86
87 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeContact children
89
90 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
91 _ <- runCmd' $ add userCorpusId (map reId ids)
92
93 printDebug "AnnuaireID" userCorpusId
94
95 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
96
97
98 flowCorpus' :: NodeType -> [HyperdataDocument]
99 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
100 -> IO CorpusId
101 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
102 --------------------------------------------------
103 -- List Ngrams Flow
104 userListId <- runCmd' $ flowListUser userId userCorpusId
105 printDebug "Working on User ListId : " userListId
106
107 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
108 -- printDebug "documentsWithId" documentsWithId
109 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
110 -- printDebug "docsWithNgrams" docsWithNgrams
111 let maps = mapNodeIdNgrams docsWithNgrams
112
113 -- printDebug "maps" (maps)
114 indexedNgrams <- runCmd' $ indexNgrams maps
115 -- printDebug "inserted ngrams" indexedNgrams
116 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
117
118 listId2 <- runCmd' $ flowList masterUserId masterCorpusId indexedNgrams
119 printDebug "Working on ListId : " listId2
120 --}
121 --------------------------------------------------
122 _ <- runCmd' $ mkDashboard userCorpusId userId
123 _ <- runCmd' $ mkGraph userCorpusId userId
124
125 -- Annuaire Flow
126 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
127
128 pure userCorpusId
129 -- runCmd' $ del [corpusId2, corpusId]
130
131 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
132 flowCorpus' _ _ _ = undefined
133
134
135 type CorpusName = Text
136
137 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
138 subFlowCorpus username cName = do
139 maybeUserId <- runCmd' (getUser username)
140
141 let userId = case maybeUserId of
142 Nothing -> panic "Error: User does not exist (yet)"
143 -- mk NodeUser gargantua_id "Node Gargantua"
144 Just user -> userLight_id user
145
146 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
147
148 rootId'' <- case rootId' of
149 [] -> runCmd' (mkRoot username userId)
150 n -> case length n >= 2 of
151 True -> panic "Error: more than 1 userNode / user"
152 False -> pure rootId'
153 let rootId = maybe (panic "error rootId") identity (head rootId'')
154 --{-
155 corpusId'' <- if username == userMaster
156 then do
157 ns <- runCmd' $ getCorporaWithParentId' rootId
158 pure $ map _node_id ns
159 else
160 pure []
161
162 --}
163 corpusId' <- if corpusId'' /= []
164 then pure corpusId''
165 else runCmd' $ mkCorpus (Just cName) Nothing rootId userId
166
167 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
168
169 printDebug "(username, userId, rootId, corpusId)"
170 (username, userId, rootId, corpusId)
171 pure (userId, rootId, corpusId)
172
173
174 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
175 subFlowAnnuaire username _cName = do
176 maybeUserId <- runCmd' (getUser username)
177
178 let userId = case maybeUserId of
179 Nothing -> panic "Error: User does not exist (yet)"
180 -- mk NodeUser gargantua_id "Node Gargantua"
181 Just user -> userLight_id user
182
183 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
184
185 rootId'' <- case rootId' of
186 [] -> runCmd' (mkRoot username userId)
187 n -> case length n >= 2 of
188 True -> panic "Error: more than 1 userNode / user"
189 False -> pure rootId'
190 let rootId = maybe (panic "error rootId") identity (head rootId'')
191
192 corpusId' <- runCmd' $ mkAnnuaire rootId userId
193
194 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
195
196 printDebug "(username, userId, rootId, corpusId)"
197 (username, userId, rootId, corpusId)
198 pure (userId, rootId, corpusId)
199
200
201
202 ------------------------------------------------------------------------
203 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
204 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
205 where
206 err = "Database.Flow.toInsert"
207
208 toInserted :: [ReturnId] -> Map HashId ReturnId
209 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
210 . filter (\r -> reInserted r == True)
211
212 data DocumentWithId =
213 DocumentWithId { documentId :: NodeId
214 , documentData :: HyperdataDocument
215 } deriving (Show)
216
217 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
218 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
219 where
220 toDocumentWithId (hash,hpd) =
221 DocumentWithId <$> fmap reId (lookup hash rs)
222 <*> Just hpd
223
224 ------------------------------------------------------------------------
225
226 data DocumentIdWithNgrams =
227 DocumentIdWithNgrams
228 { documentWithId :: DocumentWithId
229 , document_ngrams :: Map (NgramsT Ngrams) Int
230 } deriving (Show)
231
232 -- TODO add Terms (Title + Abstract)
233 -- add f :: Text -> Text
234 -- newtype Ngrams = Ngrams Text
235 -- TODO group terms
236 extractNgramsT :: HyperdataDocument -> IO (Map (NgramsT Ngrams) Int)
237 extractNgramsT doc = do
238
239 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
240 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
241 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
242 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
243 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> extractTerms (Multi EN) leText
244
245 pure $ DM.fromList $ [(NgramsT Sources source, 1)]
246 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
247 <> [(NgramsT Authors a' , 1)| a' <- authors ]
248 <> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
249
250
251
252
253 documentIdWithNgrams :: (HyperdataDocument -> IO (Map (NgramsT Ngrams) Int))
254 -> [DocumentWithId] -> IO [DocumentIdWithNgrams]
255 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
256 where
257 toDocumentIdWithNgrams d = do
258 e <- f $ documentData d
259 pure $ DocumentIdWithNgrams d e
260
261 -- | TODO check optimization
262 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
263 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
264 where
265 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
266 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
267
268 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
269 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
270 indexNgrams ng2nId = do
271 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
272 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
273
274
275 ------------------------------------------------------------------------
276 ------------------------------------------------------------------------
277 flowList :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
278 flowList uId cId ngs = do
279 -- printDebug "ngs:" ngs
280 lId <- getOrMkList cId uId
281 --printDebug "ngs" (DM.keys ngs)
282 -- TODO add stemming equivalence of 2 ngrams
283 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
284 _ <- insertGroups lId groupEd
285
286 -- compute Candidate / Map
287 let lists = ngrams2list ngs
288 -- printDebug "lists:" lists
289
290 is <- insertLists lId lists
291 printDebug "listNgrams inserted :" is
292
293 pure lId
294
295 flowListUser :: UserId -> CorpusId -> Cmd Int
296 flowListUser uId cId = getOrMkList cId uId
297
298 ------------------------------------------------------------------------
299
300 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
301 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
302 -> Map NgramsIndexed NgramsIndexed
303 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
304
305
306
307 -- TODO check: do not insert duplicates
308 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
309 insertGroups lId ngrs =
310 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
311 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
312 , ng1 /= ng2
313 ]
314
315 ------------------------------------------------------------------------
316 -- TODO: verify NgramsT lost here
317 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
318 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
319
320 -- | TODO: weight of the list could be a probability
321 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
322 insertLists lId lngs =
323 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
324 | (l,ngr) <- map (second _ngramsId) lngs
325 ]
326
327 ------------------------------------------------------------------------
328 ------------------------------------------------------------------------
329