]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[DB] Trigger for sha256sum (WIP)
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode ({-ngramsGroup,-} getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
26 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
27 import Gargantext.Database.Prelude (Cmd)
28 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
29 import Gargantext.Prelude
30 import Gargantext.Core.Text.List.Learn (Model(..))
31 -- import Gargantext.Core.Text.Metrics (takeScored)
32 import qualified Data.Char as Char
33 import qualified Data.List as List
34 import qualified Data.Map as Map
35 import qualified Data.Set as Set
36 import qualified Data.Text as Text
37
38
39 data NgramsListBuilder = BuilderStepO { stemSize :: Int
40 , stemX :: Int
41 , stopSize :: Int
42 }
43 | BuilderStep1 { withModel :: Model }
44 | BuilderStepN { withModel :: Model }
45 | Tficf { nlb_lang :: Lang
46 , nlb_group1 :: Int
47 , nlb_group2 :: Int
48 , nlb_stopSize :: StopSize
49 , nlb_userCorpusId :: UserCorpusId
50 , nlb_masterCorpusId :: MasterCorpusId
51 }
52
53
54 data StopSize = StopSize {unStopSize :: Int}
55
56 -- | TODO improve grouping functions of Authors, Sources, Institutes..
57 buildNgramsLists :: Lang
58 -> Int
59 -> Int
60 -> StopSize
61 -> UserCorpusId
62 -> MasterCorpusId
63 -> Cmd err (Map NgramsType [NgramsElement])
64 buildNgramsLists l n m s uCid mCid = do
65 ngTerms <- buildNgramsTermsList l n m s uCid mCid
66 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
67 pure $ Map.unions $ othersTerms <> [ngTerms]
68
69
70 buildNgramsOthersList :: UserCorpusId
71 -> (Text -> Text)
72 -> NgramsType
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsOthersList uCid groupIt nt = do
75 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
76
77 let
78 listSize = 9
79 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
80
81 graphTerms = List.take listSize all'
82 candiTerms = List.drop listSize all'
83
84 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
85 , toElements CandidateTerm candiTerms
86 ]
87 where
88 toElements nType x =
89 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
90 | (t,_ns) <- x
91 ]
92 )]
93
94 {-
95 buildNgramsTermsList' :: UserCorpusId
96 -> (Text -> Text)
97 -> ((Text, (Set Text, Set NodeId)) -> Bool)
98 -> Int
99 -> Int
100 -> Cmd err (Map NgramsType [NgramsElement])
101
102 buildNgramsTermsList' uCid groupIt stop gls is = do
103 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
104
105 let
106 (stops, candidates) = partitionEithers
107 $ map (\t -> if stop t then Left t else Right t)
108 $ Map.toList
109 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
110
111 (maps, candidates') = takeScored gls is
112 $ getCoocByNgrams' snd (Diagonal True)
113 $ Map.fromList candidates
114
115
116 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
117
118 (s,c,m) = (stops
119 , List.filter (\(k,_) -> List.elem k candidates') candidates
120 , List.filter (\(k,_) -> List.elem k maps) candidates
121 )
122
123 let ngs' = List.concat
124 $ map toNgramsElement
125 $ map (\t -> (StopTerm , toList' t)) s
126 <> map (\t -> (CandidateTerm, toList' t)) c
127 <> map (\t -> (MapTerm , toList' t)) m
128
129 pure $ Map.fromList [(NgramsTerms, ngs')]
130 -}
131
132
133
134
135 buildNgramsTermsList :: Lang
136 -> Int
137 -> Int
138 -> StopSize
139 -> UserCorpusId
140 -> MasterCorpusId
141 -> Cmd err (Map NgramsType [NgramsElement])
142 buildNgramsTermsList _l _n _m s uCid mCid = do
143 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms
144
145 let
146 candidatesSize = 400
147 {-
148 a = 50
149 b = 50
150 -}
151 candidatesHead = List.take candidatesSize candidates
152 candidatesTail = List.drop candidatesSize candidates
153
154 termList =
155 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
156 (map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
157 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
158
159 ngs = List.concat $ map toNgramsElement $ map (\(lt, (t,d)) -> (lt, ((t, (d,Set.singleton t))))) termList
160
161 pure $ Map.fromList [(NgramsTerms, ngs)]
162
163
164 toTermList :: Int
165 -> Int
166 -> (a -> Bool)
167 -> [a]
168 -> [(ListType, a)]
169 toTermList _ _ _ [] = []
170 toTermList a b stop ns = -- trace ("computing toTermList") $
171 map (toGargList stop CandidateTerm) xs
172 <> map (toGargList stop MapTerm) ys
173 <> toTermList a b stop zs
174 where
175 xs = take a ns
176 xz = drop a ns
177
178 ys = take b xz
179 zs = drop b xz
180
181
182 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
183 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
184 case Set.toList setNgrams of
185 [] -> []
186 (parent:children) -> [parentElem] <> childrenElems
187 where
188 parentElem = mkNgramsElement parent
189 listType
190 Nothing
191 (mSetFromList children)
192 childrenElems = map (\t -> mkNgramsElement t listType
193 (Just $ RootParent parent parent)
194 (mSetFromList [])
195 ) children
196
197
198 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
199 toGargList stop l n = case stop n of
200 True -> (StopTerm, n)
201 False -> (l, n)
202
203
204 isStopTerm :: StopSize -> Text -> Bool
205 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
206 where
207 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)