]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
[CLEAN] metrics
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Text.List
18 where
19
20 import Data.Either (partitionEithers, Either(..))
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
25 import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
26 import Gargantext.Core (Lang(..))
27 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
28 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
29 import Gargantext.Database.Admin.Utils (Cmd)
30 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
31 import Gargantext.Prelude
32 import Gargantext.Text.List.Learn (Model(..))
33 import Gargantext.Text.Metrics (takeScored)
34 import qualified Data.Char as Char
35 import qualified Data.List as List
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Data.Text as Text
39
40
41 data NgramsListBuilder = BuilderStepO { stemSize :: Int
42 , stemX :: Int
43 , stopSize :: Int
44 }
45 | BuilderStep1 { withModel :: Model }
46 | BuilderStepN { withModel :: Model }
47 | Tficf { nlb_lang :: Lang
48 , nlb_group1 :: Int
49 , nlb_group2 :: Int
50 , nlb_stopSize :: StopSize
51 , nlb_userCorpusId :: UserCorpusId
52 , nlb_masterCorpusId :: MasterCorpusId
53 }
54
55
56 data StopSize = StopSize {unStopSize :: Int}
57
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: Lang
60 -> Int
61 -> Int
62 -> StopSize
63 -> UserCorpusId
64 -> MasterCorpusId
65 -> Cmd err (Map NgramsType [NgramsElement])
66 buildNgramsLists l n m s uCid mCid = do
67 ngTerms <- buildNgramsTermsList l n m s uCid mCid
68 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
69 pure $ Map.unions $ othersTerms <> [ngTerms]
70
71
72 buildNgramsOthersList :: UserCorpusId
73 -> (Text -> Text)
74 -> NgramsType
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
78
79 let
80 listSize = 9
81 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
82 graphTerms = List.take listSize all'
83 candiTerms = List.drop listSize all'
84 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
85 , toElements CandidateTerm candiTerms]
86 where
87 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
88 | (t,_ns) <- x
89 ]
90 )
91 ]
92
93 {-
94 buildNgramsTermsList' :: UserCorpusId
95 -> (Text -> Text)
96 -> ((Text, (Set Text, Set NodeId)) -> Bool)
97 -> Int
98 -> Int
99 -> Cmd err (Map NgramsType [NgramsElement])
100
101 buildNgramsTermsList' uCid groupIt stop gls is = do
102 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
103
104 let
105 (stops, candidates) = partitionEithers
106 $ map (\t -> if stop t then Left t else Right t)
107 $ Map.toList
108 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
109
110 (maps, candidates') = takeScored gls is
111 $ getCoocByNgrams' snd (Diagonal True)
112 $ Map.fromList candidates
113
114
115 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
116
117 (s,c,m) = (stops
118 , List.filter (\(k,_) -> List.elem k candidates') candidates
119 , List.filter (\(k,_) -> List.elem k maps) candidates
120 )
121
122 let ngs' = List.concat
123 $ map toNgramsElement
124 $ map (\t -> (StopTerm, toList' t)) s
125 <> map (\t -> (CandidateTerm, toList' t)) c
126 <> map (\t -> (GraphTerm, toList' t)) m
127
128 pure $ Map.fromList [(NgramsTerms, ngs')]
129 -}
130
131 buildNgramsTermsList :: Lang
132 -> Int
133 -> Int
134 -> StopSize
135 -> UserCorpusId
136 -> MasterCorpusId
137 -> Cmd err (Map NgramsType [NgramsElement])
138 buildNgramsTermsList l n m s uCid mCid = do
139 candidates <- sortTficf <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
140
141 let
142 candidatesSize = 2000
143
144 a = 10
145 b = 10
146
147 candidatesHead = List.take candidatesSize candidates
148 candidatesTail = List.drop candidatesSize candidates
149
150 termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
151 <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
152
153 ngs = List.concat $ map toNgramsElement termList
154
155 pure $ Map.fromList [(NgramsTerms, ngs)]
156
157
158 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
159 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
160 case Set.toList setNgrams of
161 [] -> []
162 (parent:children) -> [parentElem] <> childrenElems
163 where
164 parentElem = mkNgramsElement parent
165 listType
166 Nothing
167 (mSetFromList children)
168 childrenElems = map (\t -> mkNgramsElement t listType
169 (Just $ RootParent parent parent)
170 (mSetFromList [])
171 ) children
172
173
174 toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
175 toList stop l n = case stop n of
176 True -> (StopTerm, n)
177 False -> (l, n)
178
179
180 toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
181 toTermList _ _ _ [] = []
182 toTermList a b stop ns = -- trace ("computing toTermList") $
183 map (toList stop CandidateTerm) xs
184 <> map (toList stop GraphTerm) ys
185 <> toTermList a b stop zs
186 where
187 xs = take a ns
188 ta = drop a ns
189
190 ys = take b ta
191 zs = drop b ta
192
193
194 isStopTerm :: StopSize -> Text -> Bool
195 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
196 where
197 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)