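{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# IMDB sentiment data preparation\n", "\n", "Download the dataset from: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews and save the CSV as `./imdb_data.csv`, the path read by the loading cell below.\n", "\n", "The notebook writes `trainlines.json` and `validlines.json` (one JSON object per line) and plots the token-length distributions of the sources and targets." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import string\n", "import pandas as pd\n", "from tqdm import tqdm\n", "import transformers\n", "import pandas as pd\n", "import json\n", "import seaborn" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "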
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
reviewsentiment
0One of the other reviewers has mentioned that ...positive
1A wonderful little production. <br /><br />The...positive
2I thought this was a wonderful way to spend ti...positive
3Basically there's a family where a little boy ...negative
4Petter Mattei's \"Love in the Time of Money\" is...positive
\n", "
" ], "text/plain": [ " review sentiment\n", "0 One of the other reviewers has mentioned that ... positive\n", "1 A wonderful little production.

The... positive\n", "2 I thought this was a wonderful way to spend ti... positive\n", "3 Basically there's a family where a little boy ... negative\n", "4 Petter Mattei's \"Love in the Time of Money\" is... positive" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the Kaggle CSV; it provides the `review` and `sentiment` columns used below.\n",
 "df = pd.read_csv(\"./imdb_data.csv\")\n",
 "df.head()"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "# Token counts of every written example; create_pair() appends to these lists.\n",
 "source_lengths = []\n",
 "target_lengths = []\n",
 "\n",
 "tokenizer = transformers.AutoTokenizer.from_pretrained(\"t5-large\", cache_dir=\"/workspace/cache\")\n",
 "\n",
 "def get_len(text):\n",
 "    \"\"\"Return how many token ids tokenizer.encode() yields for `text`.\"\"\"\n",
 "    return len(tokenizer.encode(text))\n",
 "\n",
 "label_set = [\"positive\", \"negative\"]\n",
 "\n",
 "def create_pair(s, t, split):\n",
 "    \"\"\"Append one example to ./<split>lines.json and record its token lengths.\n",
 "\n",
 "    Args:\n",
 "        s: Source text (a review). Every <br /><br /> marker is replaced by a space.\n",
 "        t: Target text (the sentiment label).\n",
 "        split: File-name prefix of the output file, e.g. train or valid.\n",
 "\n",
 "    Side effects:\n",
 "        Appends get_len(s) to source_lengths and get_len(t) to target_lengths, and\n",
 "        appends one JSON object followed by a newline to the output file.\n",
 "    \"\"\"\n",
 "    prefix = \"\"  # optional task prefix prepended to the source text; empty here\n",
 "    # The raw reviews contain HTML line breaks (see the df.head() preview).\n",
 "    s = s.replace(\"<br /><br />\", \" \")\n",
 "    line = {\n",
 "        \"translation\": {\n",
 "            \"s\": prefix+s,\n",
 "            \"t\": t\n",
 "        }\n",
 "    }\n",
 "    source_lengths.append(get_len(s))\n",
 "    target_lengths.append(get_len(t))\n",
 "    # Append mode: the file accumulates one JSON object per call.\n",
 "    with open(\"./\"+split+\"lines.json\", 'a+') as outfile:\n",
 "        json.dump(line, outfile)\n",
 "        outfile.write(\"\\n\")"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
 "MAX_EXAMPLES = 10000  # number of reviews taken from the top of the CSV\n",
 "MAX_WORDS = 256  # space-separated words kept per review\n",
 "\n",
 "x_list = []\n",
 "y_list = []\n",
 "\n",
 "# head() keeps exactly MAX_EXAMPLES rows (fewer if the frame is shorter).\n",
 "subset = df.head(MAX_EXAMPLES)\n",
 "for review, sentiment in zip(subset[\"review\"], subset[\"sentiment\"]):\n",
 "    x_list.append(\" \".join(review.split(\" \")[:MAX_WORDS]).strip())\n",
 "    y_list.append(sentiment.strip())"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "from sklearn.model_selection import train_test_split"
 ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
 "SEED = 42  # fixed so the train/valid split is the same on every run\n",
 "\n",
 "x_train, x_valid, y_train, y_valid = train_test_split(x_list, y_list, test_size=0.2, stratify=y_list, random_state=SEED)"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
 "# Reset the length statistics so re-running this cell does not double-count examples.\n",
 "source_lengths.clear()\n",
 "target_lengths.clear()\n",
 "\n",
 "for split, texts, labels in ((\"train\", x_train, y_train), (\"valid\", x_valid, y_valid)):\n",
 "    # create_pair() appends, so truncate first to keep re-runs from duplicating lines.\n",
 "    with open(\"./\"+split+\"lines.json\", \"w\"):\n",
 "        pass\n",
 "    for s, t in zip(texts, labels):\n",
 "        create_pair(s, t, split)"
 ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "seaborn.displot(source_lengths)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAFgCAYAAACFYaNMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAATcUlEQVR4nO3df6zd9X3f8eerOORH88Mm8RC1jWAL/UGzRaEOoRBVaZjA0G6mE6G0UbAQqyuFZck2dfkxaUhJIyVS1aRsC50FLFBFIYyyQlYKcoEkm1JInIRBgKZYRGA7JDiYkC5Rmzl574/7cXvrXtvHvvee973c50O6ut/z+X7POZ+Pojzzzdffc26qCknS9P1Y9wQkaaUywJLUxABLUhMDLElNDLAkNVnVPYFp27RpU915553d05C0smSuwRV3Bvztb3+7ewqSBKzAAEvSUmGAJamJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCaLFuAk1yd5OslXZ42dkGR7ksfG7zVjPEmuTrIzyYNJzpj1nC3j+MeSbJk1/nNJHhrPuTrJnF92IUlL1WKeAX8c2HTQ2HuAu6vqNODu8RjgAuC08bMVuAZmgg1cBbwBOBO46kC0xzG/Met5B7+XJC1pixbgqvocsO+g4c3ADWP7BuCiWeM31oz7gNVJTgLOB7ZX1b6qehbYDmwa+15eVffVzF8VvXHWa0nSsjDt7wM+saqeGtvfBE4c2+uAXbOO2z3GDje+e47xOSXZysyZNSeffPJRT3rdhpP5xu5dRz5Q0vPeT6zfwJ5dTy7Ia7V9IXtVVZKa0nttA7YBbNy48ajf8xu7d/Gr//XzCz4vScvPp37z7AV7rWnfBfGtcfmA8fvpMb4H2DDruPVj7HDj6+cYl6RlY9oBvh04cCfDFuC2WeOXjbshzgKeG5cq7gLOS7Jm/OPbecBdY993k5w17n64bNZrSdKysGiXIJJ8EngT8Koku5m5m+FDwM1JrgCeAC4Zh98BXAjsBL4PXA5QVfuSfAD44jju/VV14B/23s7MnRYvBv5k/EjSsrFoAa6qXzvErnPnOLaAKw/xOtcD188xvgN4zXzmKEmd/CScJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNWgKc5N8keTjJV5N8MsmLkpya5P4kO5N8Ksnx49gXjsc7x/5TZr3Oe8f415Kc37EWSTpWUw9wknXAvwY2VtVrgOOAS4EPAx+pqlcDzwJXjKdcATw7xj8yjiPJ6eN5PwtsAj6W5LhprkWS5qPrEsQq4MVJVgEvAZ4C3gzcMvbfAFw0tjePx4z95ybJGL+pqv66qr4O7ATOnM70JWn+ph7gqtoD/A7wJDPhfQ74EvCdqto/DtsNrBvb6
4Bd47n7x/GvnD0+x3P+jiRbk+xIsmPv3r0LuyBJOkYdlyDWMHP2eirwE8CPM3MJYdFU1baq2lhVG9euXbuYbyVJE+u4BPFPga9X1d6q+n/ArcA5wOpxSQJgPbBnbO8BNgCM/a8Anpk9PsdzJGnJ6wjwk8BZSV4yruWeCzwC3AtcPI7ZAtw2tm8fjxn776mqGuOXjrskTgVOA74wpTVI0rytOvIhC6uq7k9yC/BlYD/wFWAb8MfATUl+e4xdN55yHfAHSXYC+5i584GqejjJzczEez9wZVX9cKqLkaR5mHqAAarqKuCqg4YfZ467GKrqr4C3HOJ1Pgh8cMEnKElT4CfhJKmJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCYGWJKaGGBJamKAJamJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCYGWJKaGGBJamKAJamJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCYGWJKaGGBJamKAJamJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCYGWJKaGGBJamKAJamJAZakJgZYkpoYYElq0hLgJKuT3JLkz5M8muTnk5yQZHuSx8bvNePYJLk6yc4kDyY5Y9brbBnHP5ZkS8daJOlYdZ0B/x5wZ1X9NPBa4FHgPcDdVXUacPd4DHABcNr42QpcA5DkBOAq4A3AmcBVB6ItScvB1AOc5BXALwDXAVTVD6rqO8Bm4IZx2A3ARWN7M3BjzbgPWJ3kJOB8YHtV7auqZ4HtwKapLUSS5qnjDPhUYC/w35J8Jcm1SX4cOLGqnhrHfBM4cWyvA3bNev7uMXao8b8nydYkO5Ls2Lt37wIuRZKOXUeAVwFnANdU1euA7/G3lxsAqKoCaqHesKq2VdXGqtq4du3ahXpZSZqXjgDvBnZX1f3j8S3MBPlb49IC4/fTY/8eYMOs568fY4cal6RlYeoBrqpvAruS/NQYOhd4BLgdOHAnwxbgtrF9O3DZuBviLOC5caniLuC8JGvGP76dN8YkaVlY1fS+7wA+keR44HHgcmb+x+DmJFcATwCXjGPvAC4EdgLfH8dSVfuSfAD44jju/VW1b3pLkKT5aQlwVT0AbJxj17lzHFvAlYd4neuB6xd0cpI0JX4STpKaGGBJamKAJamJAZakJgZYkppMFOAk50wyJkma3KRnwP9pwjFJ0oQOex9wkp8HzgbWJvm3s3a9HDhuMScmSc93R/ogxvHAS8dxL5s1/l3g4sWalCStBIcNcFV9Fvhsko9X1RNTmpMkrQiTfhT5hUm2AafMfk5VvXkxJiVJK8GkAf7vwO8D1wI/XLzpSNLKMWmA91fVNYs6E0laYSa9De3TSd6e5KTx14tPGH8UU5J0jCY9Az7wRem/NWusgH+4sNORpJVjogBX1amLPRFJWmkmCnCSy+Yar6obF3Y6krRyTHoJ4vWztl/EzF+u+DJggCXpGE16CeIdsx8nWQ3ctBgTkqSV4li/jvJ7gNeFJWkeJr0G/Glm7nqAmS/h+Rng5sWalCStBJNeA/6dWdv7gSeqavcizEeSVoyJLkGML+X5c2a+EW0N8IPFnJQkrQST/kWMS4AvAG8BLgHuT+LXUUrSPEx6CeI/AK+vqqcBkqwF/hS4ZbEmJknPd5PeBfFjB+I7PHMUz5UkzWHSM+A7k9wFfHI8/lXgjsWZkiStDEf6m3CvBk6sqt9K8i+AN45dfwZ8YrEnJ0nPZ0c6A/4o8F6AqroVuBUgyT8e+/7ZIs5Nkp7XjnQd98SqeujgwTF2yqLMSJJWiCMFePVh9r14AechSSvOkQK8I8lvHDyY5F8CX1qcKUnSynCka8DvAv5Hkrfyt8HdCBwP/MoizkuSnvcOG+Cq+hZwdpJfBF4zhv+4qu5Z9JlJ0vPcpN8HfC9w7yLPRZJWFD/NJklNDLAkNTHAk
tTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU3aApzkuCRfSfI/x+NTk9yfZGeSTyU5foy/cDzeOfafMus13jvGv5bk/KalSNIx6TwDfifw6KzHHwY+UlWvBp4FrhjjVwDPjvGPjONIcjpwKfCzwCbgY0mOm9LcJWneWgKcZD3wS8C143GANwO3jENuAC4a25vHY8b+c8fxm4Gbquqvq+rrwE7gzKksQJIWQNcZ8EeBfw/8aDx+JfCdqto/Hu8G1o3tdcAugLH/uXH834zP8Zy/I8nWJDuS7Ni7d+8CLkOSjt3UA5zkl4Gnq+pL03rPqtpWVRurauPatWun9baSdFgT/Vn6BXYO8M+TXAi8CHg58HvA6iSrxlnuemDPOH4PsAHYnWQV8ArgmVnjB8x+jiQteVM/A66q91bV+qo6hZl/RLunqt4K3AtcPA7bAtw2tm8fjxn776mqGuOXjrskTgVOA74wpWVI0rx1nAEfyruBm5L8NvAV4Loxfh3wB0l2AvuYiTZV9XCSm4FHgP3AlVX1w+lPW5KOTWuAq+ozwGfG9uPMcRdDVf0V8JZDPP+DwAcXb4aStHj8JJwkNTHAktTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU0MsCQ1McCS1MQAS1ITAyxJTQywJDUxwJLUxABLUhMDLElNDLAkNTHAktTEAEtSEwMsSU2mHuAkG5Lcm+SRJA8neecYPyHJ9iSPjd9rxniSXJ1kZ5IHk5wx67W2jOMfS7Jl2muRpPnoOAPeD/y7qjodOAu4MsnpwHuAu6vqNODu8RjgAuC08bMVuAZmgg1cBbwBOBO46kC0JWk5mHqAq+qpqvry2P5L4FFgHbAZuGEcdgNw0djeDNxYM+4DVic5CTgf2F5V+6rqWWA7sGl6K5Gk+Wm9BpzkFOB1wP3AiVX11Nj1TeDEsb0O2DXrabvH2KHGJWlZaAtwkpcCfwi8q6q+O3tfVRVQC/heW5PsSLJj7969C/WykjQvLQFO8gJm4vuJqrp1DH9rXFpg/H56jO8BNsx6+voxdqjxv6eqtlXVxqrauHbt2oVbiCTNQ8ddEAGuAx6tqt+dtet24MCdDFuA22aNXzbuhjgLeG5cqrgLOC/JmvGPb+eNMUlaFlY1vOc5wNuAh5I8MMbeB3wIuDnJFcATwCVj3x3AhcBO4PvA5QBVtS/JB4AvjuPeX1X7prICSVoAUw9wVf1vIIfYfe4cxxdw5SFe63rg+oWbnSRNj5+Ek6QmBliSmhhgSWpigCWpiQGWpCYGWJKaGGBJamKAJamJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCYGWJKaGGBJamKAJamJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCYGWJKaGGBJamKAJamJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCYGWJKaGGBJamKAJamJAZakJgZYkpoYYElqYoAlqYkBlqQmBliSmhhgSWpigCWpiQGWpCbLPsBJNiX5WpKdSd7TPR9JmtSyDnCS44D/AlwAnA78WpLTe2clSZNZ1gEGzgR2VtXjVfUD4CZgc/OcJGkiq7onME/rgF2zHu8G3nDwQUm2AlvHw/+b5GtH+0af+s2zj2mCR+lVwLen8UZT4nqWNtdzjJIc7VPurKpNBw8u9wBPpKq2Adu653EkSXZU1cbueSwU17O0uZ5+y/0SxB5gw6zH68eYJC15y
z3AXwROS3JqkuOBS4Hbm+ckSRNZ1pcgqmp/kn8F3AUcB1xfVQ83T2s+lvxlkqPkepY219MsVdU9B0lakZb7JQhJWrYMsCQ1McBTluT6JE8n+ephjnlTkgeSPJzks9Oc39E60nqSvCLJp5P8n7Gey6c9x0kl2ZDk3iSPjLm+c45jkuTq8dH3B5Oc0THXSUy4nreOdTyU5PNJXtsx10lMsp5Zx74+yf4kF09zjketqvyZ4g/wC8AZwFcPsX818Ahw8nj8D7rnPM/1vA/48NheC+wDju+e9yHmehJwxth+GfAXwOkHHXMh8CdAgLOA+7vnPc/1nA2sGdsXLPf1jH3HAfcAdwAXd8/7cD+eAU9ZVX2OmQgdyq8Dt1bVk+P4p6cysWM0wXoKeFlmPjr00nHs/mnM7WhV1VNV9eWx/ZfAo8x82nK2zcCNNeM+YHWSk6Y81YlMsp6q+nxVPTse3sfMvfRL0oT/+QC8A/hDYEn/dwe8BLEU/SSwJslnknwpyWXdE5qn/wz8DPAN4CHgnVX1o94pHVmSU4DXAfcftGuuj7/PFYEl5TDrme0KZs7ul7xDrSfJOuBXgGsapnXUlvV9wM9Tq4CfA84FXgz8WZL7quoveqd1zM4HHgDeDPwjYHuS/1VV322d1WEkeSkzZ1DvWsrznNQk60nyi8wE+I3TnNuxOMJ6Pgq8u6p+dAzf1zB1Bnjp2Q08U1XfA76X5HPAa5m53rUcXQ58qGYuzu1M8nXgp4Ev9E5rbklewMx/uT9RVbfOcciy+vj7BOshyT8BrgUuqKpnpjm/ozXBejYCN434vgq4MMn+qvqj6c1ycl6CWHpuA96YZFWSlzDz7W6PNs9pPp5k5myeJCcCPwU83jqjQxjXqa8DHq2q3z3EYbcDl427Ic4Cnquqp6Y2yaMwyXqSnAzcCrxtqf+/rEnWU1WnVtUpVXUKcAvw9qUaX/AMeOqSfBJ4E/CqJLuBq4AXAFTV71fVo0nuBB4EfgRcW1WHvGWt25HWA3wA+HiSh5i5c+DdVbVUvwLxHOBtwENJHhhj7wNOhr9Zzx3M3AmxE/g+M2f4S9Uk6/mPwCuBj42zxv21dL9RbJL1LCt+FFmSmngJQpKaGGBJamKAJamJAZakJgZYkpoYYElqYoAlqcn/B8BZiWB9A6hZAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "seaborn.displot(target_lengths)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 }