]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[NGRAMS] fixes
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 Map (NgramsId, NodeId) -> insert
12 data NgramsType = Sources | Authors | Terms
13 nodes_ngrams : column type, column list
14
15 documents
16 sources
17 authors
18
19 -}
20
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Database.Flow (flowDatabase)
26 where
27 import System.FilePath (FilePath)
28 import Data.Maybe (Maybe(..), catMaybes)
29 import Data.Text (Text)
30 import Data.Map (Map)
31 import Data.Tuple.Extra (both, second)
32 import qualified Data.Map as DM
33
34 import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
35 import Gargantext.Database.Bashql (runCmd')--, del)
36 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
37 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
38 import Gargantext.Database.Node.Document.Add (add)
39 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
40 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
41 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
42 import Gargantext.Database.Types.Node (HyperdataDocument(..))
43 import Gargantext.Database.User (getUser, UserLight(..), Username)
44 import Gargantext.Prelude
45 import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
46
47 type UserId = Int
48 type RootId = Int
49 type CorpusId = Int
50
51 flowDatabase :: FilePath -> CorpusName -> IO [Int]
52 flowDatabase fp cName = do
53
54 -- Corus Flow
55 (masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"
56
57 -- Documents Flow
58 hyperdataDocuments <- map addUniqIds <$> parseDocs WOS fp
59 ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
60 printDebug "Docs IDs : " (length ids)
61 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
62
63 -- Ngrams Flow
64 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
65 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
66 let maps = mapNodeIdNgrams docsWithNgrams
67 indexedNgrams <- runCmd' $ indexNgrams maps
68 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
69
70 -- List Flow
71 listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
72 printDebug "list id:" listId2
73
74 printDebug "Docs IDs : " (length idsRepeat)
75
76 (_, _, corpusId2) <- subFlow "alexandre" cName
77 inserted <- runCmd' $ add corpusId2 (map reId ids)
78 printDebug "Inserted : " (length inserted)
79
80 pure [corpusId2, corpusId]
81
82 --runCmd' $ del [corpusId2, corpusId]
83
84 type CorpusName = Text
85
86 subFlow :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
87 subFlow username cName = do
88 maybeUserId <- runCmd' (getUser username)
89
90 let userId = case maybeUserId of
91 Nothing -> panic "Error: User does not exist (yet)"
92 -- mk NodeUser gargantua_id "Node Gargantua"
93 Just user -> userLight_id user
94
95 rootId' <- map _node_id <$> runCmd' (getRoot userId)
96
97 rootId'' <- case rootId' of
98 [] -> runCmd' (mkRoot userId)
99 n -> case length n >= 2 of
100 True -> panic "Error: more than 1 userNode / user"
101 False -> pure rootId'
102 let rootId = maybe (panic "error rootId") identity (head rootId'')
103
104 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
105 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
106
107 printDebug "(username, userId, rootId, corpusId)"
108 (username, userId, rootId, corpusId)
109 pure (userId, rootId, corpusId)
110
111 ------------------------------------------------------------------------
112
113
114 type HashId = Text
115 type NodeId = Int
116 type ListId = Int
117
118 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
119 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
120 where
121 hash = maybe "Error" identity
122
123 toInserted :: [ReturnId] -> Map HashId ReturnId
124 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
125 $ filter (\r -> reInserted r == True) rs
126
127 data DocumentWithId =
128 DocumentWithId { documentId :: NodeId
129 , documentData :: HyperdataDocument
130 }
131
132 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
133 mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
134 where
135 lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
136
137 ------------------------------------------------------------------------
138
139 data DocumentIdWithNgrams =
140 DocumentIdWithNgrams
141 { documentWithId :: DocumentWithId
142 , document_ngrams :: Map (NgramsT Ngrams) Int
143 }
144
145 -- TODO add Authors and Terms (Title + Abstract)
146 -- add f :: Text -> Text
147 -- newtype Ngrams = Ngrams Text
148 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
149 extractNgramsT doc = DM.fromList $ [(NgramsT Sources ngrams, 1)]
150 where
151 ngrams = text2ngrams $ maybe "Nothing" identity maybeNgrams
152 maybeNgrams = _hyperdataDocument_source doc
153
154 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
155 -> [DocumentWithId] -> [DocumentIdWithNgrams]
156 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
157
158 -- | TODO check optimization
159 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
160 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
161 where
162 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
163 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
164
165 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
166 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
167 indexNgrams ng2nId = do
168 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
169 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
170
171
172 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
173 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
174 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
175 | (ng, nId2int) <- DM.toList m
176 , (nId, n) <- DM.toList nId2int
177 ]
178
179
180 ------------------------------------------------------------------------
181 ------------------------------------------------------------------------
182 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
183 listFlow uId cId ng = do
184 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
185 -- TODO add stemming equivalence of 2 ngrams
186 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ng
187 _ <- insertGroups lId groupEd
188
189 -- compute Candidate / Map
190 let lists = ngrams2list ng
191 _ <- insertLists lId lists
192
193 pure lId
194
195 ------------------------------------------------------------------------
196
197 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
198 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
199 -> Map NgramsIndexed NgramsIndexed
200 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
201
202
203
204 -- TODO check: do not insert duplicates
205 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
206 insertGroups lId ngrs =
207 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
208 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
209 ]
210
211 ------------------------------------------------------------------------
212 -- TODO: verify NgramsT lost here
213 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Map ListType NgramsIndexed
214 ngrams2list = DM.fromList . zip (repeat Candidate) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
215
216 -- | TODO: weight of the list could be a probability
217 insertLists :: ListId -> Map ListType NgramsIndexed -> Cmd Int
218 insertLists lId list2ngrams =
219 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listId l) (listId l)
220 | (l,ngr) <- map (second _ngramsId) $ DM.toList list2ngrams
221 ]
222
223 ------------------------------------------------------------------------
224 ------------------------------------------------------------------------
225