]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
[DB/FACT] Gargantext.Database.Prelude
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Text.List
18 where
19
20 -- import Data.Either (partitionEithers, Either(..))
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
25 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
26 import Gargantext.Core (Lang(..))
27 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
28 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
29 import Gargantext.Database.Prelude (Cmd)
30 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
31 import Gargantext.Prelude
32 import Gargantext.Text.List.Learn (Model(..))
33 -- import Gargantext.Text.Metrics (takeScored)
34 import qualified Data.Char as Char
35 import qualified Data.List as List
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Data.Text as Text
39
40
41 data NgramsListBuilder = BuilderStepO { stemSize :: Int
42 , stemX :: Int
43 , stopSize :: Int
44 }
45 | BuilderStep1 { withModel :: Model }
46 | BuilderStepN { withModel :: Model }
47 | Tficf { nlb_lang :: Lang
48 , nlb_group1 :: Int
49 , nlb_group2 :: Int
50 , nlb_stopSize :: StopSize
51 , nlb_userCorpusId :: UserCorpusId
52 , nlb_masterCorpusId :: MasterCorpusId
53 }
54
55
56 data StopSize = StopSize {unStopSize :: Int}
57
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: Lang
60 -> Int
61 -> Int
62 -> StopSize
63 -> UserCorpusId
64 -> MasterCorpusId
65 -> Cmd err (Map NgramsType [NgramsElement])
66 buildNgramsLists l n m s uCid mCid = do
67 ngTerms <- buildNgramsTermsList l n m s uCid mCid
68 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
69 pure $ Map.unions $ othersTerms <> [ngTerms]
70
71
72 buildNgramsOthersList :: UserCorpusId
73 -> (Text -> Text)
74 -> NgramsType
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
78
79 let
80 listSize = 9
81 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
82
83 graphTerms = List.take listSize all'
84 candiTerms = List.drop listSize all'
85
86 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
87 , toElements CandidateTerm candiTerms
88 ]
89 where
90 toElements nType x =
91 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
92 | (t,_ns) <- x
93 ]
94 )]
95
96 {-
97 buildNgramsTermsList' :: UserCorpusId
98 -> (Text -> Text)
99 -> ((Text, (Set Text, Set NodeId)) -> Bool)
100 -> Int
101 -> Int
102 -> Cmd err (Map NgramsType [NgramsElement])
103
104 buildNgramsTermsList' uCid groupIt stop gls is = do
105 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
106
107 let
108 (stops, candidates) = partitionEithers
109 $ map (\t -> if stop t then Left t else Right t)
110 $ Map.toList
111 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
112
113 (maps, candidates') = takeScored gls is
114 $ getCoocByNgrams' snd (Diagonal True)
115 $ Map.fromList candidates
116
117
118 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
119
120 (s,c,m) = (stops
121 , List.filter (\(k,_) -> List.elem k candidates') candidates
122 , List.filter (\(k,_) -> List.elem k maps) candidates
123 )
124
125 let ngs' = List.concat
126 $ map toNgramsElement
127 $ map (\t -> (StopTerm , toList' t)) s
128 <> map (\t -> (CandidateTerm, toList' t)) c
129 <> map (\t -> (GraphTerm , toList' t)) m
130
131 pure $ Map.fromList [(NgramsTerms, ngs')]
132 -}
133
134
135
136
137 buildNgramsTermsList :: Lang
138 -> Int
139 -> Int
140 -> StopSize
141 -> UserCorpusId
142 -> MasterCorpusId
143 -> Cmd err (Map NgramsType [NgramsElement])
144 buildNgramsTermsList l n m s uCid mCid = do
145 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
146
147 let
148 candidatesSize = 400
149 {-
150 a = 50
151 b = 50
152 -}
153 candidatesHead = List.take candidatesSize candidates
154 candidatesTail = List.drop candidatesSize candidates
155
156 termList =
157 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
158 (map (toGargList ((isStopTerm s) .fst) GraphTerm) candidatesHead)
159 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
160
161 ngs = List.concat $ map toNgramsElement termList
162
163 pure $ Map.fromList [(NgramsTerms, ngs)]
164
165
166 toTermList :: Int
167 -> Int
168 -> (a -> Bool)
169 -> [a]
170 -> [(ListType, a)]
171 toTermList _ _ _ [] = []
172 toTermList a b stop ns = -- trace ("computing toTermList") $
173 map (toGargList stop CandidateTerm) xs
174 <> map (toGargList stop GraphTerm) ys
175 <> toTermList a b stop zs
176 where
177 xs = take a ns
178 xz = drop a ns
179
180 ys = take b xz
181 zs = drop b xz
182
183
184 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
185 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
186 case Set.toList setNgrams of
187 [] -> []
188 (parent:children) -> [parentElem] <> childrenElems
189 where
190 parentElem = mkNgramsElement parent
191 listType
192 Nothing
193 (mSetFromList children)
194 childrenElems = map (\t -> mkNgramsElement t listType
195 (Just $ RootParent parent parent)
196 (mSetFromList [])
197 ) children
198
199
200 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
201 toGargList stop l n = case stop n of
202 True -> (StopTerm, n)
203 False -> (l, n)
204
205
206
207 isStopTerm :: StopSize -> Text -> Bool
208 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
209 where
210 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)