]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[MERGE] fix warnings
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import qualified Data.Char as Char
21 import qualified Data.List as List
22 import qualified Data.Map as Map
23 import qualified Data.Set as Set
24 import qualified Data.Text as Text
25
26 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
27 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
28 import Gargantext.Core (Lang(..))
29 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
30 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
31 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
32 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
33 import Gargantext.Database.Prelude (Cmd)
34 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
35
36 import Gargantext.Prelude
37 import Gargantext.Core.Text (size)
38 import Gargantext.Core.Text.List.Learn (Model(..))
39 -- import Gargantext.Core.Text.Metrics (takeScored)
40
41
42 data NgramsListBuilder = BuilderStepO { stemSize :: Int
43 , stemX :: Int
44 , stopSize :: Int
45 }
46 | BuilderStep1 { withModel :: Model }
47 | BuilderStepN { withModel :: Model }
48 | Tficf { nlb_lang :: Lang
49 , nlb_group1 :: Int
50 , nlb_group2 :: Int
51 , nlb_stopSize :: StopSize
52 , nlb_userCorpusId :: UserCorpusId
53 , nlb_masterCorpusId :: MasterCorpusId
54 }
55
56
57 data StopSize = StopSize {unStopSize :: Int}
58
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: Lang
61 -> Int
62 -> Int
63 -> StopSize
64 -> UserCorpusId
65 -> MasterCorpusId
66 -> Cmd err (Map NgramsType [NgramsElement])
67 buildNgramsLists l n m s uCid mCid = do
68 ngTerms <- buildNgramsTermsList l n m s uCid mCid
69 othersTerms <- mapM (buildNgramsOthersList uCid identity)
70 [Authors, Sources, Institutes]
71 pure $ Map.unions $ othersTerms <> [ngTerms]
72
73
74 buildNgramsOthersList :: UserCorpusId
75 -> (Text -> Text)
76 -> NgramsType
77 -> Cmd err (Map NgramsType [NgramsElement])
78 buildNgramsOthersList uCid groupIt nt = do
79 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
80
81 let
82 listSize = 9
83 all' = List.reverse
84 $ List.sortOn (Set.size . snd . snd)
85 $ Map.toList ngs
86
87 graphTerms = List.take listSize all'
88 candiTerms = List.drop listSize all'
89
90 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
91 , toElements CandidateTerm candiTerms
92 ]
93 where
94 toElements nType x =
95 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
96 | (t, _ns) <- x
97 ]
98 )]
99
100 buildNgramsTermsList :: Lang
101 -> Int
102 -> Int
103 -> StopSize
104 -> UserCorpusId
105 -> MasterCorpusId
106 -> Cmd err (Map NgramsType [NgramsElement])
107 buildNgramsTermsList l n m s uCid mCid = do
108 candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
109 -- printDebug "head candidates" (List.take 10 $ candidates)
110 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
111
112 let
113 listSize = 400 :: Double
114 (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
115
116 (mono, multi) = List.partition (\t -> (size . fst) t < 2) candidatesTail0
117 (monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSize) mono
118 (multiHead, multiTail) = List.splitAt (round $ 0.40 * listSize) multi
119
120 termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
121 <> (map (toGargList ((isStopTerm s) . fst) MapTerm) (monoHead <> multiHead))
122 <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) (monoTail <> multiTail))
123
124 ngs = List.concat
125 $ map toNgramsElement
126 $ groupStems
127 $ map (\(listType, (t,d)) -> ( ngramsGroup l n m t
128 , GroupedText listType t d Set.empty
129 )
130 ) termList
131
132 pure $ Map.fromList [(NgramsTerms, ngs)]
133
134 type Group = Lang -> Int -> Int -> Text -> Text
135 type Stem = Text
136 type Label = Text
137 data GroupedText = GroupedText { _gt_listType :: ListType
138 , _gt_label :: Label
139 , _gt_score :: Double
140 , _gt_group :: Set Text
141 }
142 groupStems :: [(Stem, GroupedText)] -> [GroupedText]
143 groupStems = Map.elems . Map.fromListWith grouping
144 where
145 grouping (GroupedText lt1 label1 score1 group1)
146 (GroupedText lt2 label2 score2 group2)
147 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr)
148 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr)
149 where
150 lt = lt1 <> lt2
151 gr = Set.union group1 group2
152
153 toNgramsElement :: GroupedText -> [NgramsElement]
154 toNgramsElement (GroupedText listType label _ setNgrams) =
155 [parentElem] <> childrenElems
156 where
157 parent = label
158 children = Set.toList setNgrams
159 parentElem = mkNgramsElement (NgramsTerm parent)
160 listType
161 Nothing
162 (mSetFromList (NgramsTerm <$> children))
163 childrenElems = map (\t -> mkNgramsElement t listType
164 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
165 (mSetFromList [])
166 ) (NgramsTerm <$> children)
167
168
169 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
170 toGargList isStop l n = case isStop n of
171 True -> (StopTerm, n)
172 False -> (l, n)
173
174
175 isStopTerm :: StopSize -> Text -> Bool
176 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
177 where
178 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)