]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
no logs
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15
16 module Gargantext.Text.List
17 where
18
19 import Data.Either (partitionEithers, Either(..))
20 import Debug.Trace (trace)
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
25 import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
26 import Gargantext.Core (Lang(..))
27 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
28 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
29 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
30 import Gargantext.Database.Utils (Cmd)
31 import Gargantext.Text.List.Learn (Model(..))
32 import Gargantext.Text.Metrics (takeScored)
33 import Gargantext.Prelude
34 --import Gargantext.Text.Terms (TermType(..))
35 import qualified Data.Char as Char
36 import qualified Data.List as List
37 import qualified Data.Map as Map
38 import qualified Data.Set as Set
39 import qualified Data.Text as Text
40
41
42 data NgramsListBuilder = BuilderStepO { stemSize :: Int
43 , stemX :: Int
44 , stopSize :: Int
45 }
46 | BuilderStep1 { withModel :: Model }
47 | BuilderStepN { withModel :: Model }
48 | Tficf { nlb_lang :: Lang
49 , nlb_group1 :: Int
50 , nlb_group2 :: Int
51 , nlb_stopSize :: StopSize
52 , nlb_userCorpusId :: UserCorpusId
53 , nlb_masterCorpusId :: MasterCorpusId
54 }
55
56
57 data StopSize = StopSize {unStopSize :: Int}
58
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
61 -> Cmd err (Map NgramsType [NgramsElement])
62 buildNgramsLists l n m s uCid mCid = do
63 ngTerms <- buildNgramsTermsList l n m s uCid mCid
64 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
65 pure $ Map.unions $ othersTerms <> [ngTerms]
66
67
68 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
69 -> Cmd err (Map NgramsType [NgramsElement])
70 buildNgramsOthersList uCid groupIt nt = do
71 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
72
73 let
74 listSize = 9
75 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
76 graphTerms = List.take listSize all'
77 candiTerms = List.drop listSize all'
78 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
79 , toElements CandidateTerm candiTerms]
80 where
81 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
82 | (t,_ns) <- x
83 ]
84 )
85 ]
86
87 --{-
88 buildNgramsTermsList' :: UserCorpusId
89 -> (Text -> Text)
90 -> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int
91 -> Cmd err (Map NgramsType [NgramsElement])
92 --}
93 buildNgramsTermsList' uCid groupIt stop gls is = do
94 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
95
96 let
97 (stops, candidates) = partitionEithers
98 $ map (\t -> if stop t then Left t else Right t)
99 $ Map.toList
100 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
101
102 (maps, candidates') = takeScored gls is
103 $ getCoocByNgrams' snd (Diagonal True)
104 $ Map.fromList candidates
105
106
107 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
108
109 (s,c,m) = (stops
110 , List.filter (\(k,_) -> List.elem k candidates') candidates
111 , List.filter (\(k,_) -> List.elem k maps) candidates
112 )
113
114 let ngs' = List.concat
115 $ map toNgramsElement
116 $ map (\t -> (StopTerm, toList' t)) s
117 <> map (\t -> (CandidateTerm, toList' t)) c
118 <> map (\t -> (GraphTerm, toList' t)) m
119
120 pure $ Map.fromList [(NgramsTerms, ngs')]
121
122
123 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
124 -> Cmd err (Map NgramsType [NgramsElement])
125 buildNgramsTermsList l n m s uCid mCid = do
126 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
127 let
128 candidatesSize = 2000
129 a = 10
130 b = 10
131 candidatesHead = List.take candidatesSize candidates
132 candidatesTail = List.drop candidatesSize candidates
133 termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
134 <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
135 let ngs = List.concat $ map toNgramsElement termList
136
137 pure $ Map.fromList [(NgramsTerms, ngs)]
138
139
140 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
141 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
142 case Set.toList setNgrams of
143 [] -> []
144 (parent:children) -> [parentElem] <> childrenElems
145 where
146 parentElem = mkNgramsElement parent
147 listType
148 Nothing
149 (mSetFromList children)
150 childrenElems = map (\t -> mkNgramsElement t listType
151 (Just $ RootParent parent parent)
152 (mSetFromList [])
153 ) children
154
155
156 toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
157 toList stop l n = case stop n of
158 True -> (StopTerm, n)
159 False -> (l, n)
160
161
162 toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
163 toTermList _ _ _ [] = []
164 toTermList a b stop ns = trace ("computing toTermList") $
165 map (toList stop CandidateTerm) xs
166 <> map (toList stop GraphTerm) ys
167 <> toTermList a b stop zs
168 where
169 xs = take a ns
170 ta = drop a ns
171
172 ys = take b ta
173 zs = drop b ta
174
175
176 isStopTerm :: StopSize -> Text -> Bool
177 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
178 where
179 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)