summaryrefslogtreecommitdiff
path: root/BDA
diff options
context:
space:
mode:
Diffstat (limited to 'BDA')
-rw-r--r--BDA/.gitignore2
-rw-r--r--BDA/CM2.md12
-rw-r--r--BDA/TP2.md18
-rw-r--r--BDA/tp2.py38
4 files changed, 70 insertions, 0 deletions
diff --git a/BDA/.gitignore b/BDA/.gitignore
new file mode 100644
index 0000000..bdf79f7
--- /dev/null
+++ b/BDA/.gitignore
@@ -0,0 +1,2 @@
+bda_datasets/
+mrjob/
diff --git a/BDA/CM2.md b/BDA/CM2.md
new file mode 100644
index 0000000..a5a1b8d
--- /dev/null
+++ b/BDA/CM2.md
@@ -0,0 +1,12 @@
+# MapReduce
+- Map
+- Reduce
+
+## Combiner
+-> Reduce **doit** être associatif **et** commutatif (ex : pas de moyenne)
+
+## YARN
+- Il existe
+- Permet de gérer les jobs à notre place
+## MRJob
+`pip install mrjob` \ No newline at end of file
diff --git a/BDA/TP2.md b/BDA/TP2.md
new file mode 100644
index 0000000..f35f752
--- /dev/null
+++ b/BDA/TP2.md
@@ -0,0 +1,18 @@
+# 1. Histogramme
+
+mapper (ligne):
+ renvoie (int(log(pop)), ville)  # clé = tranche de l'histogramme, pas la ville
+reducer (paires):
+ pour chaque paire:
+ res[clé1] += 1
+
+# 2. LastFM
+## top decouvertes
+mapper ligne renvoie (User, 1)
+reducer1 prend [(User1, 1), (User2, 1), (User1, 1), ...] renvoie [(User1, nbUser1), (User2, nbUser2), ...]
+reducer2 prend out reducer1 renvoie (key, max(Values))
+
+## top ecoutes
+mapper ligne renvoie (Artiste, 1)
+reducer1 prend [(Artiste1, 1), (Artiste2, 1), (Artiste1, 1), ...] renvoie [(Artiste1, nbArtiste1), (Artiste2, nbArtiste2), ...]
+reducer2 prend out reducer1 renvoie (key, max(Values)) \ No newline at end of file
diff --git a/BDA/tp2.py b/BDA/tp2.py
new file mode 100644
index 0000000..6b5b00e
--- /dev/null
+++ b/BDA/tp2.py
@@ -0,0 +1,38 @@
+from mrjob.job import MRJob
+from mrjob.step import MRStep
+import re
+
+WORD_RE = re.compile(r"[\w']+")
+
class MRWordFrequencyCount(MRJob):
    """Two-step job: count word frequencies, then emit the most frequent word.

    Step 1 maps each input line to (word, 1) pairs and sums the counts per
    word; step 2 funnels every (total, word) pair under a single key and
    yields the maximum, i.e. the (count, word) pair of the most frequent word.
    """

    def steps(self):
        # Explicit two-step pipeline: count, then find the global maximum.
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_max_word),
        ]

    def mapper(self, _, line):
        # One (word, 1) pair per word occurrence on the line.
        for token in WORD_RE.findall(line):
            yield token, 1

    def reducer_count_words(self, word, counts):
        # Re-key everything under None so the next step sees all totals
        # in a single reduce group.
        yield None, (sum(counts), word)

    def reducer_find_max_word(self, _, count_word_pairs):
        # max() on (count, word) tuples compares counts first, so this
        # yields the most frequent word as a single (count, word) pair.
        yield max(count_word_pairs)
+
class MRLongestWord(MRJob):
    """Job that emits the (length, word) pair of the longest word in the input.

    The mapper keys every word under ``None`` so the combiner and the reducer
    can both simply take ``max`` of the (length, word) pairs they receive.
    Keying under ``None`` in the mapper — rather than re-keying inside the
    combiner as the original did — matters for correctness: a combiner may
    run zero or more times, so mapper output and combiner output must have
    the same (key, value) shape for the reducer to work in both cases.
    """

    def mapper(self, _, line):
        # Emit (None, (length, word)) so all words land in one reduce group.
        for word in WORD_RE.findall(line):
            yield None, (len(word), word)

    def combiner(self, key, len_word_pairs):
        # Local pre-aggregation: keep only the longest word seen so far.
        # NOTE: mrjob passes the values as a generator, so the original
        # `len_word[0]` indexing raised TypeError; use max() instead, which
        # is associative and commutative as a combiner must be.
        yield key, max(len_word_pairs)

    def reducer(self, key, len_word_pairs):
        # Final maximum over all candidates: one (length, word) pair.
        yield max(len_word_pairs)
+
if __name__ == '__main__':
    # Switch which job runs by (un)commenting the line below.
    # MRWordFrequencyCount.run()
    MRLongestWord.run()