]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[REFACT][FLOW] insertDocs fun
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
26 import Gargantext.Database.Prelude (Cmd)
27 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
28 import Gargantext.Prelude
29 import Gargantext.Core.Text.List.Learn (Model(..))
30 -- import Gargantext.Core.Text.Metrics (takeScored)
31 import qualified Data.Char as Char
32 import qualified Data.List as List
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35 import qualified Data.Text as Text
36
37
38 data NgramsListBuilder = BuilderStepO { stemSize :: Int
39 , stemX :: Int
40 , stopSize :: Int
41 }
42 | BuilderStep1 { withModel :: Model }
43 | BuilderStepN { withModel :: Model }
44 | Tficf { nlb_lang :: Lang
45 , nlb_group1 :: Int
46 , nlb_group2 :: Int
47 , nlb_stopSize :: StopSize
48 , nlb_userCorpusId :: UserCorpusId
49 , nlb_masterCorpusId :: MasterCorpusId
50 }
51
52
53 data StopSize = StopSize {unStopSize :: Int}
54
55 -- | TODO improve grouping functions of Authors, Sources, Institutes..
56 buildNgramsLists :: Lang
57 -> Int
58 -> Int
59 -> StopSize
60 -> UserCorpusId
61 -> MasterCorpusId
62 -> Cmd err (Map NgramsType [NgramsElement])
63 buildNgramsLists l n m s uCid mCid = do
64 ngTerms <- buildNgramsTermsList l n m s uCid mCid
65 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
66 pure $ Map.unions $ othersTerms <> [ngTerms]
67
68
69 buildNgramsOthersList :: UserCorpusId
70 -> (Text -> Text)
71 -> NgramsType
72 -> Cmd err (Map NgramsType [NgramsElement])
73 buildNgramsOthersList uCid groupIt nt = do
74 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
75
76 let
77 listSize = 9
78 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
79
80 graphTerms = List.take listSize all'
81 candiTerms = List.drop listSize all'
82
83 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
84 , toElements CandidateTerm candiTerms
85 ]
86 where
87 toElements nType x =
88 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
89 | (t,_ns) <- x
90 ]
91 )]
92
93 {-
94 buildNgramsTermsList' :: UserCorpusId
95 -> (Text -> Text)
96 -> ((Text, (Set Text, Set NodeId)) -> Bool)
97 -> Int
98 -> Int
99 -> Cmd err (Map NgramsType [NgramsElement])
100
101 buildNgramsTermsList' uCid groupIt stop gls is = do
102 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
103
104 let
105 (stops, candidates) = partitionEithers
106 $ map (\t -> if stop t then Left t else Right t)
107 $ Map.toList
108 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
109
110 (maps, candidates') = takeScored gls is
111 $ getCoocByNgrams' snd (Diagonal True)
112 $ Map.fromList candidates
113
114
115 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
116
117 (s,c,m) = (stops
118 , List.filter (\(k,_) -> List.elem k candidates') candidates
119 , List.filter (\(k,_) -> List.elem k maps) candidates
120 )
121
122 let ngs' = List.concat
123 $ map toNgramsElement
124 $ map (\t -> (StopTerm , toList' t)) s
125 <> map (\t -> (CandidateTerm, toList' t)) c
126 <> map (\t -> (MapTerm , toList' t)) m
127
128 pure $ Map.fromList [(NgramsTerms, ngs')]
129 -}
130
131
132
133
134 buildNgramsTermsList :: Lang
135 -> Int
136 -> Int
137 -> StopSize
138 -> UserCorpusId
139 -> MasterCorpusId
140 -> Cmd err (Map NgramsType [NgramsElement])
141 buildNgramsTermsList l n m s uCid mCid = do
142 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
143
144 let
145 candidatesSize = 400
146 {-
147 a = 50
148 b = 50
149 -}
150 candidatesHead = List.take candidatesSize candidates
151 candidatesTail = List.drop candidatesSize candidates
152
153 termList =
154 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
155 (map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
156 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
157
158 ngs = List.concat $ map toNgramsElement termList
159
160 pure $ Map.fromList [(NgramsTerms, ngs)]
161
162
163 toTermList :: Int
164 -> Int
165 -> (a -> Bool)
166 -> [a]
167 -> [(ListType, a)]
168 toTermList _ _ _ [] = []
169 toTermList a b stop ns = -- trace ("computing toTermList") $
170 map (toGargList stop CandidateTerm) xs
171 <> map (toGargList stop MapTerm) ys
172 <> toTermList a b stop zs
173 where
174 xs = take a ns
175 xz = drop a ns
176
177 ys = take b xz
178 zs = drop b xz
179
180
181 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
182 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
183 case Set.toList setNgrams of
184 [] -> []
185 (parent:children) -> [parentElem] <> childrenElems
186 where
187 parentElem = mkNgramsElement parent
188 listType
189 Nothing
190 (mSetFromList children)
191 childrenElems = map (\t -> mkNgramsElement t listType
192 (Just $ RootParent parent parent)
193 (mSetFromList [])
194 ) children
195
196
197 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
198 toGargList stop l n = case stop n of
199 True -> (StopTerm, n)
200 False -> (l, n)
201
202
203 isStopTerm :: StopSize -> Text -> Bool
204 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
205 where
206 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)