]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
[FIX] encoding update node
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Prelude (Cmd)
26 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
27 import Gargantext.Prelude
28 import Gargantext.Text.List.Learn (Model(..))
29 -- import Gargantext.Text.Metrics (takeScored)
30 import qualified Data.Char as Char
31 import qualified Data.List as List
32 import qualified Data.Map as Map
33 import qualified Data.Set as Set
34 import qualified Data.Text as Text
35
36
37 data NgramsListBuilder = BuilderStepO { stemSize :: Int
38 , stemX :: Int
39 , stopSize :: Int
40 }
41 | BuilderStep1 { withModel :: Model }
42 | BuilderStepN { withModel :: Model }
43 | Tficf { nlb_lang :: Lang
44 , nlb_group1 :: Int
45 , nlb_group2 :: Int
46 , nlb_stopSize :: StopSize
47 , nlb_userCorpusId :: UserCorpusId
48 , nlb_masterCorpusId :: MasterCorpusId
49 }
50
51
52 data StopSize = StopSize {unStopSize :: Int}
53
54 -- | TODO improve grouping functions of Authors, Sources, Institutes..
55 buildNgramsLists :: Lang
56 -> Int
57 -> Int
58 -> StopSize
59 -> UserCorpusId
60 -> MasterCorpusId
61 -> Cmd err (Map NgramsType [NgramsElement])
62 buildNgramsLists l n m s uCid mCid = do
63 ngTerms <- buildNgramsTermsList l n m s uCid mCid
64 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
65 pure $ Map.unions $ othersTerms <> [ngTerms]
66
67
68 buildNgramsOthersList :: UserCorpusId
69 -> (Text -> Text)
70 -> NgramsType
71 -> Cmd err (Map NgramsType [NgramsElement])
72 buildNgramsOthersList uCid groupIt nt = do
73 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
74
75 let
76 listSize = 9
77 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
78
79 graphTerms = List.take listSize all'
80 candiTerms = List.drop listSize all'
81
82 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
83 , toElements CandidateTerm candiTerms
84 ]
85 where
86 toElements nType x =
87 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
88 | (t,_ns) <- x
89 ]
90 )]
91
92 {-
93 buildNgramsTermsList' :: UserCorpusId
94 -> (Text -> Text)
95 -> ((Text, (Set Text, Set NodeId)) -> Bool)
96 -> Int
97 -> Int
98 -> Cmd err (Map NgramsType [NgramsElement])
99
100 buildNgramsTermsList' uCid groupIt stop gls is = do
101 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
102
103 let
104 (stops, candidates) = partitionEithers
105 $ map (\t -> if stop t then Left t else Right t)
106 $ Map.toList
107 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
108
109 (maps, candidates') = takeScored gls is
110 $ getCoocByNgrams' snd (Diagonal True)
111 $ Map.fromList candidates
112
113
114 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
115
116 (s,c,m) = (stops
117 , List.filter (\(k,_) -> List.elem k candidates') candidates
118 , List.filter (\(k,_) -> List.elem k maps) candidates
119 )
120
121 let ngs' = List.concat
122 $ map toNgramsElement
123 $ map (\t -> (StopTerm , toList' t)) s
124 <> map (\t -> (CandidateTerm, toList' t)) c
125 <> map (\t -> (GraphTerm , toList' t)) m
126
127 pure $ Map.fromList [(NgramsTerms, ngs')]
128 -}
129
130
131
132
133 buildNgramsTermsList :: Lang
134 -> Int
135 -> Int
136 -> StopSize
137 -> UserCorpusId
138 -> MasterCorpusId
139 -> Cmd err (Map NgramsType [NgramsElement])
140 buildNgramsTermsList l n m s uCid mCid = do
141 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
142
143 let
144 candidatesSize = 400
145 {-
146 a = 50
147 b = 50
148 -}
149 candidatesHead = List.take candidatesSize candidates
150 candidatesTail = List.drop candidatesSize candidates
151
152 termList =
153 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
154 (map (toGargList ((isStopTerm s) .fst) GraphTerm) candidatesHead)
155 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
156
157 ngs = List.concat $ map toNgramsElement termList
158
159 pure $ Map.fromList [(NgramsTerms, ngs)]
160
161
162 toTermList :: Int
163 -> Int
164 -> (a -> Bool)
165 -> [a]
166 -> [(ListType, a)]
167 toTermList _ _ _ [] = []
168 toTermList a b stop ns = -- trace ("computing toTermList") $
169 map (toGargList stop CandidateTerm) xs
170 <> map (toGargList stop GraphTerm) ys
171 <> toTermList a b stop zs
172 where
173 xs = take a ns
174 xz = drop a ns
175
176 ys = take b xz
177 zs = drop b xz
178
179
180 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
181 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
182 case Set.toList setNgrams of
183 [] -> []
184 (parent:children) -> [parentElem] <> childrenElems
185 where
186 parentElem = mkNgramsElement parent
187 listType
188 Nothing
189 (mSetFromList children)
190 childrenElems = map (\t -> mkNgramsElement t listType
191 (Just $ RootParent parent parent)
192 (mSetFromList [])
193 ) children
194
195
196 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
197 toGargList stop l n = case stop n of
198 True -> (StopTerm, n)
199 False -> (l, n)
200
201
202
203 isStopTerm :: StopSize -> Text -> Bool
204 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
205 where
206 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)