diff options
| -rw-r--r-- | BDA/.gitignore | 2 | ||||
| -rw-r--r-- | BDA/CM2.md | 12 | ||||
| -rw-r--r-- | BDA/TP2.md | 18 | ||||
| -rw-r--r-- | BDA/tp2.py | 38 |
4 files changed, 70 insertions, 0 deletions
diff --git a/BDA/.gitignore b/BDA/.gitignore new file mode 100644 index 0000000..bdf79f7 --- /dev/null +++ b/BDA/.gitignore @@ -0,0 +1,2 @@ +bda_datasets/ +mrjob/ diff --git a/BDA/CM2.md b/BDA/CM2.md new file mode 100644 index 0000000..a5a1b8d --- /dev/null +++ b/BDA/CM2.md @@ -0,0 +1,12 @@ +# MapReduce +- Map +- Reduce + +## Combiner +-> Reduce **doit** être associative et commutative (ex : pas de moyenne) + +## YARN +- Il existe +- Permet de gérer les jobs à notre place +## MRJob +`pip install mrjob`
\ No newline at end of file diff --git a/BDA/TP2.md b/BDA/TP2.md new file mode 100644 index 0000000..f35f752 --- /dev/null +++ b/BDA/TP2.md @@ -0,0 +1,18 @@ +# 1. Histogramme + +mapper (ligne): + renvoie (ville, int(log(pop))) +reducer (paires): + pour chaque paire: + res[clé1] += 1 + +# 2. LastFM +## top découvertes +mapper ligne renvoie (User, 1) +reducer1 prend [(User1, 1), (User2, 1), (User1, 1), ...] renvoie [(User1, nbUser1), (User2, nbUser2), ...] +reducer2 prend out reducer1 renvoie (key, max(Values)) + +## top écoutes +mapper ligne renvoie (Artiste, 1) +reducer1 prend [(Artiste1, 1), (Artiste2, 1), (Artiste1, 1), ...] renvoie [(Artiste1, nbArtiste1), (Artiste2, nbArtiste2), ...] +reducer2 prend out reducer1 renvoie (key, max(Values))
\ No newline at end of file diff --git a/BDA/tp2.py b/BDA/tp2.py new file mode 100644 index 0000000..6b5b00e --- /dev/null +++ b/BDA/tp2.py @@ -0,0 +1,38 @@ +from mrjob.job import MRJob +from mrjob.step import MRStep +import re + +WORD_RE = re.compile(r"[\w']+") + +class MRWordFrequencyCount(MRJob): + + def mapper(self, _, line): + for word in WORD_RE.findall(line): + yield word, 1 + def steps(self): + return [ + MRStep(mapper=self.mapper, + reducer=self.reducer_count_words), + MRStep(reducer=self.reducer_find_max_word) + ] + + def reducer_count_words(self, key, values): + yield None, (sum(values), key) + def reducer_find_max_word(self, _, kvp): + yield max(kvp) + +class MRLongestWord(MRJob): + + def mapper(self, _, line): + for word in WORD_RE.findall(line): + yield word, len(word) + + def combiner(self, word, len_word): + yield None, (len_word[0], word) + + def reducer(self, _, len_word): + yield max(len_word) + +if __name__ == '__main__': +#MRWordFrequencyCount.run() + MRLongestWord.run() |
