]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
Adding ngrams to the table now has a proper patch!
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode ({-ngramsGroup,-} getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
26 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
27 import Gargantext.Database.Prelude (Cmd)
28 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
29 import Gargantext.Prelude
30 import Gargantext.Core.Text.List.Learn (Model(..))
31 -- import Gargantext.Core.Text.Metrics (takeScored)
32 import qualified Data.Char as Char
33 import qualified Data.List as List
34 import qualified Data.Map as Map
35 import qualified Data.Set as Set
36 import qualified Data.Text as Text
37
38
39 data NgramsListBuilder = BuilderStepO { stemSize :: Int
40 , stemX :: Int
41 , stopSize :: Int
42 }
43 | BuilderStep1 { withModel :: Model }
44 | BuilderStepN { withModel :: Model }
45 | Tficf { nlb_lang :: Lang
46 , nlb_group1 :: Int
47 , nlb_group2 :: Int
48 , nlb_stopSize :: StopSize
49 , nlb_userCorpusId :: UserCorpusId
50 , nlb_masterCorpusId :: MasterCorpusId
51 }
52
53
54 data StopSize = StopSize {unStopSize :: Int}
55
56 -- | TODO improve grouping functions of Authors, Sources, Institutes..
57 buildNgramsLists :: Lang
58 -> Int
59 -> Int
60 -> StopSize
61 -> UserCorpusId
62 -> MasterCorpusId
63 -> Cmd err (Map NgramsType [NgramsElement])
64 buildNgramsLists l n m s uCid mCid = do
65 ngTerms <- buildNgramsTermsList l n m s uCid mCid
66 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
67 pure $ Map.unions $ othersTerms <> [ngTerms]
68
69
70 buildNgramsOthersList :: UserCorpusId
71 -> (Text -> Text)
72 -> NgramsType
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsOthersList uCid groupIt nt = do
75 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
76
77 let
78 listSize = 9
79 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
80
81 graphTerms = List.take listSize all'
82 candiTerms = List.drop listSize all'
83
84 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
85 , toElements CandidateTerm candiTerms
86 ]
87 where
88 toElements nType x =
89 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
90 | (t,_ns) <- x
91 ]
92 )]
93
94 {-
95 buildNgramsTermsList' :: UserCorpusId
96 -> (Text -> Text)
97 -> ((Text, (Set Text, Set NodeId)) -> Bool)
98 -> Int
99 -> Int
100 -> Cmd err (Map NgramsType [NgramsElement])
101
102 buildNgramsTermsList' uCid groupIt stop gls is = do
103 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
104
105 let
106 (stops, candidates) = partitionEithers
107 $ map (\t -> if stop t then Left t else Right t)
108 $ Map.toList
109 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
110
111 (maps, candidates') = takeScored gls is
112 $ getCoocByNgrams' snd (Diagonal True)
113 $ Map.fromList candidates
114
115
116 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
117
118 (s,c,m) = (stops
119 , List.filter (\(k,_) -> List.elem k candidates') candidates
120 , List.filter (\(k,_) -> List.elem k maps) candidates
121 )
122
123 let ngs' = List.concat
124 $ map toNgramsElement
125 $ map (\t -> (StopTerm , toList' t)) s
126 <> map (\t -> (CandidateTerm, toList' t)) c
127 <> map (\t -> (MapTerm , toList' t)) m
128
129 pure $ Map.fromList [(NgramsTerms, ngs')]
130 -}
131
132
133
134
135 buildNgramsTermsList :: Lang
136 -> Int
137 -> Int
138 -> StopSize
139 -> UserCorpusId
140 -> MasterCorpusId
141 -> Cmd err (Map NgramsType [NgramsElement])
142 buildNgramsTermsList _l _n _m s uCid mCid = do
143 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms
144 printDebug "head candidates" (List.take 10 $ candidates)
145 printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
146
147 let
148 candidatesSize = 400
149 candidatesHead = List.take candidatesSize candidates
150 candidatesTail = List.drop candidatesSize candidates
151
152 termList =
153 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
154 (map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
155 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
156
157 ngs = List.concat $ map toNgramsElement $ map (\(lt, (t,d)) -> (lt, ((t, (d,Set.singleton t))))) termList
158
159 pure $ Map.fromList [(NgramsTerms, ngs)]
160
161
162 toTermList :: Int
163 -> Int
164 -> (a -> Bool)
165 -> [a]
166 -> [(ListType, a)]
167 toTermList _ _ _ [] = []
168 toTermList a b stop ns = -- trace ("computing toTermList") $
169 map (toGargList stop CandidateTerm) xs
170 <> map (toGargList stop MapTerm) ys
171 <> toTermList a b stop zs
172 where
173 xs = take a ns
174 xz = drop a ns
175
176 ys = take b xz
177 zs = drop b xz
178
179
180 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
181 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
182 case Set.toList setNgrams of
183 [] -> []
184 (parent:children) -> [parentElem] <> childrenElems
185 where
186 parentElem = mkNgramsElement parent
187 listType
188 Nothing
189 (mSetFromList children)
190 childrenElems = map (\t -> mkNgramsElement t listType
191 (Just $ RootParent parent parent)
192 (mSetFromList [])
193 ) children
194
195
196 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
197 toGargList stop l n = case stop n of
198 True -> (StopTerm, n)
199 False -> (l, n)
200
201
202 isStopTerm :: StopSize -> Text -> Bool
203 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
204 where
205 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)