Pietro Lesci committed on
Commit
9716e1f
1 Parent(s): dc4ad9e

fix blake3 (vaex dependency)

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -0
  2. tests/notebook.ipynb +144 -1
requirements.txt CHANGED
@@ -6,6 +6,7 @@ xlrd==2.0.1
6
  openpyxl==3.0.9
7
  watchdog==2.1.6
8
  vaex==4.7.0
 
9
 
10
  # english
11
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
 
6
  openpyxl==3.0.9
7
  watchdog==2.1.6
8
  vaex==4.7.0
9
+ blake3==0.2.1 # to make vaex work
10
 
11
  # english
12
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
tests/notebook.ipynb CHANGED
@@ -12,7 +12,150 @@
12
  "import sys\n",
13
  "sys.path.append(\"..\")\n",
14
  "from src.preprocessing import PreprocessingPipeline\n",
15
- "import pandas as pd"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  ]
17
  },
18
  {
 
12
  "import sys\n",
13
  "sys.path.append(\"..\")\n",
14
  "from src.preprocessing import PreprocessingPipeline\n",
15
+ "import pandas as pd\n",
16
+ "import vaex"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "metadata": {},
22
+ "source": [
23
+ "----\n",
24
+ "### Test vaex"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 2,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "df = pd.read_csv(\"../data/test_en.csv\")"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "metadata": {},
40
+ "outputs": [
41
+ {
42
+ "data": {
43
+ "text/html": [
44
+ "<table>\n",
45
+ "<thead>\n",
46
+ "<tr><th># </th><th>label </th><th>text </th></tr>\n",
47
+ "</thead>\n",
48
+ "<tbody>\n",
49
+ "<tr><td><i style='opacity: 0.6'>0</i> </td><td>0 </td><td>&quot;I think it&#x27;s time John Rambo move on with his l...</td></tr>\n",
50
+ "<tr><td><i style='opacity: 0.6'>1</i> </td><td>1 </td><td>&quot;I&#x27;ve just watch 2 films of Pang brothers, The E...</td></tr>\n",
51
+ "<tr><td><i style='opacity: 0.6'>2</i> </td><td>1 </td><td>&#x27;Jewel Thief is *THE* crime thriller of Bollywoo...</td></tr>\n",
52
+ "<tr><td><i style='opacity: 0.6'>3</i> </td><td>0 </td><td>&#x27;This so called remake is terrible. I went to se...</td></tr>\n",
53
+ "<tr><td><i style='opacity: 0.6'>4</i> </td><td>1 </td><td>&#x27;When Northfork debuted at the Cannes Film Festi...</td></tr>\n",
54
+ "<tr><td>... </td><td>... </td><td>... </td></tr>\n",
55
+ "<tr><td><i style='opacity: 0.6'>4,995</i></td><td>0 </td><td>&#x27;The title tells it all -- Ed Gein, the butcher ...</td></tr>\n",
56
+ "<tr><td><i style='opacity: 0.6'>4,996</i></td><td>0 </td><td>&quot;This film makes about as much sense as an &#x27;Ozzi...</td></tr>\n",
57
+ "<tr><td><i style='opacity: 0.6'>4,997</i></td><td>0 </td><td>&#x27;&quot;Sex and the City&quot; has some great things going ...</td></tr>\n",
58
+ "<tr><td><i style='opacity: 0.6'>4,998</i></td><td>0 </td><td>&#x27;Please...if anybody gets the chance to read thi...</td></tr>\n",
59
+ "<tr><td><i style='opacity: 0.6'>4,999</i></td><td>0 </td><td>&#x27;...a film comes along that manages to be absolu...</td></tr>\n",
60
+ "</tbody>\n",
61
+ "</table>"
62
+ ],
63
+ "text/plain": [
64
+ "# label text\n",
65
+ "0 0 \"I think it's time John Rambo move on with his l...\n",
66
+ "1 1 \"I've just watch 2 films of Pang brothers, The E...\n",
67
+ "2 1 'Jewel Thief is *THE* crime thriller of Bollywoo...\n",
68
+ "3 0 'This so called remake is terrible. I went to se...\n",
69
+ "4 1 'When Northfork debuted at the Cannes Film Festi...\n",
70
+ "... ... ...\n",
71
+ "4,995 0 'The title tells it all -- Ed Gein, the butcher ...\n",
72
+ "4,996 0 \"This film makes about as much sense as an 'Ozzi...\n",
73
+ "4,997 0 '\"Sex and the City\" has some great things going ...\n",
74
+ "4,998 0 'Please...if anybody gets the chance to read thi...\n",
75
+ "4,999 0 '...a film comes along that manages to be absolu..."
76
+ ]
77
+ },
78
+ "execution_count": 3,
79
+ "metadata": {},
80
+ "output_type": "execute_result"
81
+ }
82
+ ],
83
+ "source": [
84
+ "vaex.from_pandas(df)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 4,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "df_small = df.iloc[:1000]"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 5,
99
+ "metadata": {},
100
+ "outputs": [
101
+ {
102
+ "data": {
103
+ "text/html": [
104
+ "<table>\n",
105
+ "<thead>\n",
106
+ "<tr><th># </th><th>label </th><th>text </th></tr>\n",
107
+ "</thead>\n",
108
+ "<tbody>\n",
109
+ "<tr><td><i style='opacity: 0.6'>0</i> </td><td>0 </td><td>&quot;I think it&#x27;s time John Rambo move on with his l...</td></tr>\n",
110
+ "<tr><td><i style='opacity: 0.6'>1</i> </td><td>1 </td><td>&quot;I&#x27;ve just watch 2 films of Pang brothers, The E...</td></tr>\n",
111
+ "<tr><td><i style='opacity: 0.6'>2</i> </td><td>1 </td><td>&#x27;Jewel Thief is *THE* crime thriller of Bollywoo...</td></tr>\n",
112
+ "<tr><td><i style='opacity: 0.6'>3</i> </td><td>0 </td><td>&#x27;This so called remake is terrible. I went to se...</td></tr>\n",
113
+ "<tr><td><i style='opacity: 0.6'>4</i> </td><td>1 </td><td>&#x27;When Northfork debuted at the Cannes Film Festi...</td></tr>\n",
114
+ "<tr><td>... </td><td>... </td><td>... </td></tr>\n",
115
+ "<tr><td><i style='opacity: 0.6'>995</i></td><td>1 </td><td>&quot;It&#x27;s a funny business, reviewing movies. These ...</td></tr>\n",
116
+ "<tr><td><i style='opacity: 0.6'>996</i></td><td>1 </td><td>&#x27;Right from the start you see that &quot;Anchors Awei...</td></tr>\n",
117
+ "<tr><td><i style='opacity: 0.6'>997</i></td><td>0 </td><td>&#x27;I saw this movie in NEW York city. I was waitin...</td></tr>\n",
118
+ "<tr><td><i style='opacity: 0.6'>998</i></td><td>0 </td><td>&#x27;Firstly, this is NOT an adaptation of a Stephen...</td></tr>\n",
119
+ "<tr><td><i style='opacity: 0.6'>999</i></td><td>1 </td><td>&quot;Barbra Streisand&#x27;s debut television special is ...</td></tr>\n",
120
+ "</tbody>\n",
121
+ "</table>"
122
+ ],
123
+ "text/plain": [
124
+ "# label text\n",
125
+ "0 0 \"I think it's time John Rambo move on with his l...\n",
126
+ "1 1 \"I've just watch 2 films of Pang brothers, The E...\n",
127
+ "2 1 'Jewel Thief is *THE* crime thriller of Bollywoo...\n",
128
+ "3 0 'This so called remake is terrible. I went to se...\n",
129
+ "4 1 'When Northfork debuted at the Cannes Film Festi...\n",
130
+ "... ... ...\n",
131
+ "995 1 \"It's a funny business, reviewing movies. These ...\n",
132
+ "996 1 'Right from the start you see that \"Anchors Awei...\n",
133
+ "997 0 'I saw this movie in NEW York city. I was waitin...\n",
134
+ "998 0 'Firstly, this is NOT an adaptation of a Stephen...\n",
135
+ "999 1 \"Barbra Streisand's debut television special is ..."
136
+ ]
137
+ },
138
+ "execution_count": 5,
139
+ "metadata": {},
140
+ "output_type": "execute_result"
141
+ }
142
+ ],
143
+ "source": [
144
+ "vaex.from_pandas(df_small)"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": null,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": []
153
+ },
154
+ {
155
+ "cell_type": "markdown",
156
+ "metadata": {},
157
+ "source": [
158
+ "----"
159
  ]
160
  },
161
  {