]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Multi/RAKE.hs
Merge branch 'tree-json'
[gargantext.git] / src / Gargantext / Text / Terms / Multi / RAKE.hs
1 {-|
2 Module : Gargantext.Text.Terms.Multi.RAKE
3 Description : Rapid automatic keyword extraction (RAKE)
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Personal notes for the integration of RAKE in Gargantext.
11
12 The RAKE algorithm is a simple, rapid and effective way to extract
13 keywords, but it is very sensitive to the quality of the stop word list.
14
15 Indeed, the very first step uses the stop word list to cut the text
16 into candidate keywords. The context is the sentence: co-occurrences
17 and occurrences are counted at that level and divided to compute the
18 metric of one word. The metric of a multi-word term is equal to the
19 sum of the metrics of each of its words.
20
21 Finally, the metric favours longer keywords, so it depends heavily on
22 the quality of the cut, which depends on the quality of the stop word list.
23
24 As a consequence, to improve the effectiveness of the RAKE algorithm, I
25 am wondering if some Bayesian features could be added to improve the
26 quality of the stop word list over time.
27
28 -}
29
30 {-# LANGUAGE NoImplicitPrelude #-}
31
32 module Gargantext.Text.Terms.Multi.RAKE (multiterms_rake)
33 where
34
35 import Data.Text (Text)
36 import NLP.RAKE.Text
37 import Gargantext.Prelude
38
-- | Extract scored candidate terms from a text with RAKE.
--
-- The input is first split with 'pSplitter'; the result is then handed to
-- 'candidates' together with 'hardStopList' and the library defaults
-- 'defaultNosplit' and 'defaultNolist'.
multiterms_rake :: Text -> [WordScore]
multiterms_rake txt =
  candidates hardStopList defaultNosplit defaultNolist (pSplitter txt)
43
-- | Hard-coded stop word list used by 'multiterms_rake'.
--
-- The words are kept in a local list and converted with 'mkStopwordsStr'.
-- Entries are plain lowercase strings; contractions are listed explicitly
-- (e.g. @"ain't"@, @"can't"@).
hardStopList :: StopwordsMap
hardStopList = mkStopwordsStr stopWords
  where
    stopWords =
      [ "a", "a's", "able", "about", "above", "apply", "according", "accordingly"
      , "across", "actually", "after", "afterwards", "again", "against"
      , "ain't", "all", "allow", "allows", "almost", "alone", "along"
      , "already", "also", "although", "always", "am", "among", "amongst"
      , "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything"
      , "anyway", "anyways", "anywhere", "analyze", "apart", "appear", "appreciate", "appropriate"
      , "are", "aren't", "around", "as", "aside", "ask", "asking", "associated", "at"
      , "available", "away", "awfully", "based", "b", "be", "became", "because", "become"
      , "becomes", "becoming", "been", "before", "beforehand", "behind", "being"
      , "believe", "below", "beside", "besides", "best", "better", "between", "beyond"
      , "both", "brief", "but", "by", "c", "c'mon", "c's", "came", "can", "can't", "cannot"
      , "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co"
      , "com", "come", "comes", "common", "concerning", "consequently", "consider", "considering"
      , "contain", "containing", "contains", "corresponding", "could", "couldn't", "course"
      , "currently", "d", "definitely", "described", "detects", "detecting", "despite", "did", "didn't", "different"
      , "do", "does", "doesn't", "doing", "don't", "done", "down", "downwards", "during", "e"
      , "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely"
      , "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone"
      , "everything", "everywhere", "ex", "exactly", "example", "except", "f", "far"
      , "few", "find", "fifth", "first", "five", "followed", "following", "follows", "for"
      , "former", "formerly", "forth", "four", "from", "further", "furthermore", "g"
      , "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got"
      , "gotten", "greetings", "h", "had", "hadn't", "happens", "hardly", "has", "hasn't"
      , "have", "haven't", "having", "he", "he's", "hello", "help", "hence", "her", "here"
      , "here's", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi"
      , "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "i"
      , "i'd", "identify", "i'll", "i'm", "i've", "ie", "if", "ignored", "immediate", "in", "inasmuch"
      , "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar"
      , "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll", "it's", "its"
      , "itself", "j", "just", "k", "keep", "keeps", "kept", "know", "known", "knows", "l"
      , "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let"
      , "let's", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd"
      , "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might"
      , "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "n"
      , "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither"
      , "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none"
      , "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "o"
      , "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one"
      , "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
      , "ours", "ourselves", "out", "outside", "over", "overall", "own", "p", "particular"
      , "particularly", "per", "perhaps", "placed", "please", "plus", "possible"
      , "presents", "presumably", "probably", "provides", "q", "que", "quite", "qv", "r", "rather"
      , "rd", "re", "really", "reasonably", "regarding", "regardless", "regards"
      , "relatively", "respectively", "right", "s", "said", "same", "saw", "say"
      , "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed"
      , "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious"
      , "seriously", "seven", "several", "shall", "she", "should", "shouldn't", "since"
      , "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime"
      , "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify"
      , "specifying", "still", "sub", "such", "sup", "sure", "t", "t's", "take", "taken"
      , "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that's"
      , "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there"
      , "there's", "thereafter", "thereby", "therefore", "therein", "theres"
      , "thereupon", "these", "they", "they'd", "they'll", "they're", "they've"
      , "think", "third", "this", "thorough", "thoroughly", "those", "though", "three"
      , "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward"
      , "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "u", "un"
      , "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon"
      , "us", "use", "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
      , "various", "very", "via", "viz", "vs", "w", "want", "wants", "was", "wasn't", "way"
      , "we", "we'd", "we'll", "we're", "we've", "welcome", "well", "went", "were"
      , "weren't", "what", "what's", "whatever", "when", "whence", "whenever", "where"
      , "where's", "whereafter", "whereas", "whereby", "wherein", "whereupon"
      , "wherever", "whether", "which", "while", "whither", "who", "who's", "whoever"
      , "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within"
      , "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes", "yet", "you"
      , "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"
      , "z", "zero"
      ]
115
116