]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[NGRAMS-TABLE] use real
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
18 where
19
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both, second)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Text.Terms (extractTerms)
37 import Gargantext.Database.Node.Document.Add (add)
38 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
39 import Gargantext.Database.Root (getRoot)
40 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
41 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError)
42 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
44 import Gargantext.Database.Schema.User (getUser, UserLight(..))
45 import Gargantext.Database.Types.Node (HyperdataDocument(..))
46 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
47 import Gargantext.Database.Utils (Cmd)
48 import Gargantext.Text.Terms (TermType(..))
49 import Gargantext.Ext.IMT (toSchoolName)
50 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
51 import Gargantext.Prelude
52 import Gargantext.Text.Parsers (parseDocs, FileFormat)
53 import System.FilePath (FilePath)
54 import qualified Data.Map as DM
55
56 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
57 flowCorpus ff fp cName = do
58 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
59 params <- flowInsert NodeCorpus hyperdataDocuments' cName
60 flowCorpus' NodeCorpus hyperdataDocuments' params
61
62
63 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
64 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
65 flowInsert _nt hyperdataDocuments cName = do
66 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
67
68 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
69 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
70
71 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
72 _ <- add userCorpusId (map reId ids)
73
74 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
75
76
77 flowAnnuaire :: FilePath -> Cmd err ()
78 flowAnnuaire filePath = do
79 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
80 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
81 printDebug "length annuaire" ps
82
83
84 flowInsertAnnuaire :: CorpusName -> [ToDbData]
85 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
86 flowInsertAnnuaire name children = do
87
88 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
89 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
90
91 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
92 _ <- add userCorpusId (map reId ids)
93
94 printDebug "AnnuaireID" userCorpusId
95
96 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
97
98
99 flowCorpus' :: HasNodeError err
100 => NodeType -> [HyperdataDocument]
101 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
102 -> Cmd err CorpusId
103 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
104 --------------------------------------------------
105 -- List Ngrams Flow
106 userListId <- flowListUser userId userCorpusId
107 printDebug "Working on User ListId : " userListId
108
109 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
110 -- printDebug "documentsWithId" documentsWithId
111 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
112 -- printDebug "docsWithNgrams" docsWithNgrams
113 let maps = mapNodeIdNgrams docsWithNgrams
114
115 -- printDebug "maps" (maps)
116 indexedNgrams <- indexNgrams maps
117 -- printDebug "inserted ngrams" indexedNgrams
118 _ <- insertToNodeNgrams indexedNgrams
119
120 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
121 printDebug "Working on ListId : " listId2
122 --}
123 --------------------------------------------------
124 _ <- mkDashboard userCorpusId userId
125 _ <- mkGraph userCorpusId userId
126
127 -- Annuaire Flow
128 -- _ <- mkAnnuaire rootUserId userId
129
130 pure userCorpusId
131 -- del [corpusId2, corpusId]
132
133 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
134 flowCorpus' _ _ _ = undefined
135
136
137 type CorpusName = Text
138
139 subFlowCorpus :: Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
140 subFlowCorpus username cName = do
141 maybeUserId <- getUser username
142
143 let userId = case maybeUserId of
144 Nothing -> panic "Error: User does not exist (yet)"
145 -- mk NodeUser gargantua_id "Node Gargantua"
146 Just user -> userLight_id user
147
148 rootId' <- map _node_id <$> getRoot username
149
150 rootId'' <- case rootId' of
151 [] -> mkRoot username userId
152 n -> case length n >= 2 of
153 True -> panic "Error: more than 1 userNode / user"
154 False -> pure rootId'
155 let rootId = maybe (panic "error rootId") identity (head rootId'')
156
157 corpusId'' <- if username == userMaster
158 then do
159 ns <- getCorporaWithParentId rootId
160 pure $ map _node_id ns
161 else
162 pure []
163
164 corpusId' <- if corpusId'' /= []
165 then pure corpusId''
166 else mkCorpus (Just cName) Nothing rootId userId
167
168 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
169
170 printDebug "(username, userId, rootId, corpusId)"
171 (username, userId, rootId, corpusId)
172 pure (userId, rootId, corpusId)
173
174
175 subFlowAnnuaire :: Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
176 subFlowAnnuaire username _cName = do
177 maybeUserId <- getUser username
178
179 let userId = case maybeUserId of
180 Nothing -> panic "Error: User does not exist (yet)"
181 -- mk NodeUser gargantua_id "Node Gargantua"
182 Just user -> userLight_id user
183
184 rootId' <- map _node_id <$> getRoot username
185
186 rootId'' <- case rootId' of
187 [] -> mkRoot username userId
188 n -> case length n >= 2 of
189 True -> panic "Error: more than 1 userNode / user"
190 False -> pure rootId'
191 let rootId = maybe (panic "error rootId") identity (head rootId'')
192
193 corpusId' <- mkAnnuaire rootId userId
194
195 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
196
197 printDebug "(username, userId, rootId, corpusId)"
198 (username, userId, rootId, corpusId)
199 pure (userId, rootId, corpusId)
200
201
202
203 ------------------------------------------------------------------------
204 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
205 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
206 where
207 err = "Database.Flow.toInsert"
208
209 toInserted :: [ReturnId] -> Map HashId ReturnId
210 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
211 . filter (\r -> reInserted r == True)
212
213 data DocumentWithId =
214 DocumentWithId { documentId :: !NodeId
215 , documentData :: !HyperdataDocument
216 } deriving (Show)
217
218 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
219 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
220 where
221 toDocumentWithId (hash,hpd) =
222 DocumentWithId <$> fmap reId (lookup hash rs)
223 <*> Just hpd
224
225 ------------------------------------------------------------------------
226
227 data DocumentIdWithNgrams =
228 DocumentIdWithNgrams
229 { documentWithId :: !DocumentWithId
230 , document_ngrams :: !(Map (NgramsT Ngrams) Int)
231 } deriving (Show)
232
233 -- TODO add Terms (Title + Abstract)
234 -- add f :: Text -> Text
235 -- newtype Ngrams = Ngrams Text
236 -- TODO group terms
237 extractNgramsT :: HyperdataDocument -> Cmd err (Map (NgramsT Ngrams) Int)
238 extractNgramsT doc = do
239
240 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
241 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
242 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
243 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
244 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
245
246 pure $ DM.fromList $ [(NgramsT Sources source, 1)]
247 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
248 <> [(NgramsT Authors a' , 1)| a' <- authors ]
249 <> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
250
251
252
253
254 documentIdWithNgrams :: (HyperdataDocument -> Cmd err (Map (NgramsT Ngrams) Int))
255 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
256 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
257 where
258 toDocumentIdWithNgrams d = do
259 e <- f $ documentData d
260 pure $ DocumentIdWithNgrams d e
261
262 -- | TODO check optimization
263 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
264 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
265 where
266 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
267 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
268
269 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
270 -> Cmd err (Map (NgramsT NgramsIndexed) (Map NodeId Int))
271 indexNgrams ng2nId = do
272 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
273 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
274
275
276 ------------------------------------------------------------------------
277 ------------------------------------------------------------------------
278 flowList :: HasNodeError err => UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd err ListId
279 flowList uId cId ngs = do
280 -- printDebug "ngs:" ngs
281 lId <- getOrMkList cId uId
282 --printDebug "ngs" (DM.keys ngs)
283 -- TODO add stemming equivalence of 2 ngrams
284 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
285 _ <- insertGroups lId groupEd
286
287 -- compute Candidate / Map
288 let lists = ngrams2list ngs
289 -- printDebug "lists:" lists
290
291 is <- insertLists lId lists
292 printDebug "listNgrams inserted :" is
293
294 pure lId
295
296 flowListUser :: HasNodeError err => UserId -> CorpusId -> Cmd err Int
297 flowListUser uId cId = getOrMkList cId uId
298
299 ------------------------------------------------------------------------
300
301 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
302 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
303 -> Map NgramsIndexed NgramsIndexed
304 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
305
306
307
308 -- TODO check: do not insert duplicates
309 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
310 insertGroups lId ngrs =
311 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
312 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
313 , ng1 /= ng2
314 ]
315
316 ------------------------------------------------------------------------
317 -- TODO: verify NgramsT lost here
318 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
319 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
320
321 -- | TODO: weight of the list could be a probability
322 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd err Int
323 insertLists lId lngs =
324 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
325 | (l,ngr) <- map (second _ngramsId) lngs
326 ]
327
328 ------------------------------------------------------------------------
329 ------------------------------------------------------------------------