]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
Merge branch 'dev' into dev-graph-explorer-gexf
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Text.List
18 where
19
20 import Data.Either (partitionEithers, Either(..))
21 -- import Debug.Trace (trace)
22 import Data.Map (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
26 import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
27 import Gargantext.Core (Lang(..))
28 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
29 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
30 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
31 import Gargantext.Database.Utils (Cmd)
32 import Gargantext.Text.List.Learn (Model(..))
33 import Gargantext.Text.Metrics (takeScored)
34 import Gargantext.Prelude
35 --import Gargantext.Text.Terms (TermType(..))
36 import qualified Data.Char as Char
37 import qualified Data.List as List
38 import qualified Data.Map as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41
42
43 data NgramsListBuilder = BuilderStepO { stemSize :: Int
44 , stemX :: Int
45 , stopSize :: Int
46 }
47 | BuilderStep1 { withModel :: Model }
48 | BuilderStepN { withModel :: Model }
49 | Tficf { nlb_lang :: Lang
50 , nlb_group1 :: Int
51 , nlb_group2 :: Int
52 , nlb_stopSize :: StopSize
53 , nlb_userCorpusId :: UserCorpusId
54 , nlb_masterCorpusId :: MasterCorpusId
55 }
56
57
58 data StopSize = StopSize {unStopSize :: Int}
59
60 -- | TODO improve grouping functions of Authors, Sources, Institutes..
61 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
62 -> Cmd err (Map NgramsType [NgramsElement])
63 buildNgramsLists l n m s uCid mCid = do
64 ngTerms <- buildNgramsTermsList l n m s uCid mCid
65 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
66 pure $ Map.unions $ othersTerms <> [ngTerms]
67
68
69 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
70 -> Cmd err (Map NgramsType [NgramsElement])
71 buildNgramsOthersList uCid groupIt nt = do
72 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
73
74 let
75 listSize = 9
76 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
77 graphTerms = List.take listSize all'
78 candiTerms = List.drop listSize all'
79 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
80 , toElements CandidateTerm candiTerms]
81 where
82 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
83 | (t,_ns) <- x
84 ]
85 )
86 ]
87
88 --{-
89 buildNgramsTermsList' :: UserCorpusId
90 -> (Text -> Text)
91 -> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int
92 -> Cmd err (Map NgramsType [NgramsElement])
93 --}
94 buildNgramsTermsList' uCid groupIt stop gls is = do
95 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
96
97 let
98 (stops, candidates) = partitionEithers
99 $ map (\t -> if stop t then Left t else Right t)
100 $ Map.toList
101 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
102
103 (maps, candidates') = takeScored gls is
104 $ getCoocByNgrams' snd (Diagonal True)
105 $ Map.fromList candidates
106
107
108 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
109
110 (s,c,m) = (stops
111 , List.filter (\(k,_) -> List.elem k candidates') candidates
112 , List.filter (\(k,_) -> List.elem k maps) candidates
113 )
114
115 let ngs' = List.concat
116 $ map toNgramsElement
117 $ map (\t -> (StopTerm, toList' t)) s
118 <> map (\t -> (CandidateTerm, toList' t)) c
119 <> map (\t -> (GraphTerm, toList' t)) m
120
121 pure $ Map.fromList [(NgramsTerms, ngs')]
122
123
124 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
125 -> Cmd err (Map NgramsType [NgramsElement])
126 buildNgramsTermsList l n m s uCid mCid = do
127 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
128 let
129 candidatesSize = 2000
130 a = 10
131 b = 10
132 candidatesHead = List.take candidatesSize candidates
133 candidatesTail = List.drop candidatesSize candidates
134 termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
135 <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
136 let ngs = List.concat $ map toNgramsElement termList
137
138 pure $ Map.fromList [(NgramsTerms, ngs)]
139
140
141 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
142 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
143 case Set.toList setNgrams of
144 [] -> []
145 (parent:children) -> [parentElem] <> childrenElems
146 where
147 parentElem = mkNgramsElement parent
148 listType
149 Nothing
150 (mSetFromList children)
151 childrenElems = map (\t -> mkNgramsElement t listType
152 (Just $ RootParent parent parent)
153 (mSetFromList [])
154 ) children
155
156
157 toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
158 toList stop l n = case stop n of
159 True -> (StopTerm, n)
160 False -> (l, n)
161
162
163 toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
164 toTermList _ _ _ [] = []
165 toTermList a b stop ns = -- trace ("computing toTermList") $
166 map (toList stop CandidateTerm) xs
167 <> map (toList stop GraphTerm) ys
168 <> toTermList a b stop zs
169 where
170 xs = take a ns
171 ta = drop a ns
172
173 ys = take b ta
174 zs = drop b ta
175
176
177 isStopTerm :: StopSize -> Text -> Bool
178 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
179 where
180 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)