{ "cells": [ { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from os import listdir\n", "from os.path import isfile, join\n" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "# set path and load data\n", "path = '/Users/lsacy/data/HOPE_WSDM_2022/Train/'\n", "path2 = '/Users/lsacy/data/HOPE_WSDM_2022/Test/'\n", "path3 = '/Users/lsacy/data/HOPE_WSDM_2022/Validation/'\n", "\n", "\n", "files = [f for f in listdir(path) if isfile(join(path, f))]\n", "files2 = [f for f in listdir(path2) if isfile(join(path2, f))]\n", "files3 = [f for f in listdir(path3) if isfile(join(path3, f))]\n", "\n", "df_list = []\n", "for file in files:\n", " df = pd.read_csv(path+file)\n", " df_list.append(df)\n", "\n", "for file in files2:\n", " df = pd.read_csv(path2+file)\n", " df_list.append(df)\n", "\n", "for file in files3:\n", " df= pd.read_csv(path3+file)\n", " df_list.append(df)\n", " " ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [], "source": [ "# do basic cleaning (removing NaNs, etc.)\n", "\n", "# remove 'Unnamed: 0' columns\n", "for df in df_list:\n", " try: \n", " df.drop(['Unnamed: 0'], axis=1, inplace=True)\n", " except:\n", " pass\n", "\n", "for df in df_list:\n", " try: \n", " df.drop(['Unnamed: 0.1'], axis=1, inplace=True)\n", " except:\n", " pass\n", "\n", "for df in df_list:\n", " try: \n", " df.drop(['Unnamed: 0.1.1'], axis=1, inplace=True)\n", " except:\n", " pass\n", "\n", "# find 'Type' == NaN\n", "for i in range(len(df_list)):\n", " if df_list[i]['Type'].isnull().any():\n", " print(i)\n", " \n", "# show the row number of Nan value in 'type'\n", "for i in range(len(df_list)):\n", " if df_list[i]['Type'].isnull().any():\n", " print(df_list[i][df_list[i]['Type'].isnull()])\n", "\n", "# Set the type of the row with NaN value to 'T'\n", "for i in range(len(df_list)):\n", " if df_list[i]['Type'].isnull().any():\n", " df_list[i].loc[df_list[i]['Type'].isnull(), 'Type'] = 'T'\n", " " ] }, { "cell_type": "code", "execution_count": 224, "metadata": {}, "outputs": [], "source": [ "# to get the data to prompt - completion format, we need to check for \n", "# continuaty of conversiontion using i == i.shift(1), print the index of the row, and i\n", "for i in range(len(df_list)):\n", " # check if df_list[i] the values of 'Type' column are switching between T and P for each row\n", " # if not, then print the index of the row, and i\n", " if (df_list[i]['Type'] == df_list[i]['Type'].shift(1)).any():\n", " print(i)\n", "\n", "# add a column 'key' to each df in df_list: identify the rows where 'Type' does not between T and P\n", "for i in range(len(df_list)):\n", " df_list[i]['key'] = (df_list[i]['Type'] != df_list[i]['Type'].shift(1)).astype(int).cumsum()\n", "\n", "# create a new df_list2, only keep the columns ['key', 'Type', 'Utterance']\n", "df_list2 = []\n", "for i in range(len(df_list)):\n", " df_temp = df_list[i][['key', 'Type', 'Utterance']]\n", " df_list2.append(df_temp)\n", " \n", "# apply groupby to each df in df_list to get the data in prompt - completion format\n", "for i in range(len(df_list2)):\n", " df_list2[i] = df_list2[i].groupby(['key', 'Type'])['Utterance'].apply(' '.join)" ] }, { "cell_type": "code", "execution_count": 233, "metadata": {}, "outputs": [], "source": [ "# convert series to frame\n", "for i in range(len(df_list2)):\n", " df_list2[i] = df_list2[i].to_frame()\n", "# drop the index\n", "for i in range(len(df_list2)):\n", " df_list2[i].reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": 270, "metadata": {}, "outputs": [], "source": [ "# check again for swithing type occurences\n", "for i in range(len(df_list2)):\n", " # check if df_list[i] the values of 'Type' column are switching between T and P for each row\n", " # if not, then print the index of the row, and i\n", " if (df_list2[i]['Type'] == df_list2[i]['Type'].shift(1)).any():\n", " print(i)\n", " " ] }, { "cell_type": "code", "execution_count": 239, "metadata": {}, "outputs": [], "source": [ "# save list of patient start index\n", "patient_start=[]\n", "for i, df in enumerate(df_list2):\n", " if df.iloc[0]['Type'] == 'P':\n", " patient_start.append(i)\n", "\n", "# save list of therapist start index\n", "therapist_start = [i for i in range(len(df_list2)) if i not in patient_start]\n", "\n", "# create a new temporary dataframe with 2 columns: Type and Utterance\n", "df_temp = pd.DataFrame(columns=['prompt', 'completion'])\n", "\n", "# create a list of dataframes, each dataframe is a conversation\n", "df_list3 = []\n", "for i in therapist_start:\n", " df_temp = pd.DataFrame(columns=['prompt', 'completion'])\n", " for row in range(len(df_list2[i])):\n", " if df_list2[i]['Type'][row] == 'P' and (len(df_list2[i]) - 1) >= row+1:\n", " print(i, row)\n", " df_temp = df_temp.append({'prompt': df_list2[i]['Utterance'][row], 'completion': df_list2[i]['Utterance'][row+1]}, ignore_index=True)\n", " df_list3.append(df_temp)\n", "\n", "for i in patient_start:\n", " df_temp = pd.DataFrame(columns=['prompt', 'completion'])\n", " for row in range(len(df_list2[i])):\n", " if df_list2[i]['Type'][row] == 'P' and (len(df_list2[i]) - 1) >= row+1:\n", " print(i, row)\n", " df_temp = df_temp.append({'prompt': df_list2[i]['Utterance'][row], 'completion': df_list2[i]['Utterance'][row+1]}, ignore_index=True)\n", " df_list3.append(df_temp)" ] }, { "cell_type": "code", "execution_count": 332, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Im here',\n", " \"So I've been made to come in and talk to you. because something happened on the bus. And there I may have said something to somebody and they're just like, oh, you can't say that. So they actually said, we're going to take you to jail. from jail. They just called me here.\",\n", " \"They didn't explain anything. The government never explains anything. They just don't want me to understand what's going on because they just want to keep me out of captivity.\",\n", " \"nope, it's the entire government, entire government, higher government.\",\n", " \"all my life. I've done this all my life. They they're always listening. They're always around. They're always there.\",\n", " \"always there. Like, for example, they're in the lights right now. They're listening to everything that we're talking about. So that's the reason why I really can't talk about what happened on the bus because they're just going to use that against and then it's locked me up again. I think we take more minutes.\",\n", " \"Yes, I take the medication, because if I don't take the medication, government comes and takes me away and puts me into a hospital where they force me the medication. And then the lights are always on and they can always hear everything that I'm saying and what I'm thinking and what I'm going to do.\",\n", " \"wanting us to take over the world, to take over the world, people in the world, of course, they want to use everything and use everything that everybody knows to use it against them so that they can actually have the world they want in that baby and make everybody pretend that they don't know what's going on. So therefore, they're using everybody's thoughts against themselves, so that the government can use it to their benefit.\",\n", " \"No. So when I'm home, I think I'm perfectly fine because I set up my house in my room to actually make it government proof so that nobody can actually hear see what's going on in there. So when I'm home, I'm perfectly fine.\",\n", " \"being home and being home I've saved my mom saved my brother's save everybody safe. And being out in the world in the world. That's a nobody safe.\",\n", " \"No, I told you they are employed, and they can hear everything that we're saying right now. So no, we're not safe here.\",\n", " 'at home and safe in my room, that I mean government.',\n", " \"I would stay home all the time, but I can't stay home because I mean to go and see doctors and talk to people and do the things that the government needs me or wants me to do. But I don't want to do those things because she's in my thoughts and they're trying to use everybody and get sleep. I can't deal with that.\",\n", " \"Everybody knows that they're in the light. It's common knowledge. It's common knowledge. Everybody knows that.\",\n", " \"No, of course, everybody hears the lights talking to you. I don't know what you're talking about. Like everybody hears it. Sometimes it's a whisper. For some people. Sometimes it's a yell. And if you want to not pay attention to what the lights are saying to you, then I don't know what's wrong with you, but everybody can hear it.\",\n", " 'It sounds like a robot talking. It sounds like we need to know this information we need to what do you do.',\n", " 'Yes. where your notes gonna go?',\n", " 'who has access to the chart room.',\n", " \"Can I get a copy of your notes after you're done?\",\n", " \"Yes, because I might trust you. And I don't know if I do yet. But I don't trust the other people who work here because I don't know if they're working for the government, which they probably are working for the government. So they can actually have access to all of my notes and the things that you're writing down, which I don't know what they are, because you're not going to give me a copy of it yet. And I just don't trust that they can actually get that information and because they're going to use it to their advantage and they're going to give it to the government, which I can't have that. So could you please not take notes?\",\n", " \"Well, because I made my room at home government proof, I think they're going to try to steal that technology when I actually create it so that they can then disrupt my room and then everybody else's rooms who are just like mine so that they can then use their mind control to then control me to do the things that they want me to do.\",\n", " 'Correct. Without my room, and without the technology and the systems and the things that I created, then the government will be able to control everything that I do.',\n", " 'I feel fine',\n", " \"I don't really have any problem Yeah, I just I feel fine. I don't feel happy. I don't feel sad. I just, I'm just here\",\n", " \"well, the government's gonna do what they're gonna do anyway. So it doesn't matter if I'm happy sad, pissed off angry. any of that the government's going to make me do what they want me to do anyway. So it doesn't matter how I feel.\",\n", " \"No, because I've learned my lesson to not say those things out loud into to talk about what the government's doing because I can't actually inform anybody else because nobody's listening to me because They're already controlled by the government because they don't have a room like I do at home.\",\n", " \"While people don't understand what's going on, then that's their fault. that's their problem. They should have listened to me. And I could actually gave them the technology that they needed to actually create the room so that the government can't hear them.\",\n", " 'No. Quite the opposite. It is completely opposite.',\n", " \"Okay. Yeah, we live with my mom. She does that she's supposed to she doesn't go in my room because she knows that. She goes into the robots gonna mess up the technology so she stays out.\",\n", " 'I talk to people all the time, but they are in my head. We have conversations all the time about how we can actually make the world a better place and then how we can actually go through and then destroy the government or do good. Just pump it up.',\n", " 'that would be good', 'Three days works.'], dtype=object)" ] }, "execution_count": 332, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_list3[110]['prompt'].values" ] }, { "cell_type": "code", "execution_count": 311, "metadata": {}, "outputs": [], "source": [ "df_final = pd.DataFrame(columns=['prompt', 'completion'])" ] }, { "cell_type": "code", "execution_count": 318, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/m_/_mwt902x1z595k65b7qf743m0000gn/T/ipykernel_82735/1421327753.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", " df_final = df_final.append(i, ignore_index=True)\n" ] } ], "source": [ "for i in df_list3:\n", " df_final = df_final.append(i, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 319, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
promptcompletion
0Good, good.It's good to see you. Yeah.
1Yeah, I mean, good week with the roommates. I ...Silence. Terrific. So what was what specifical...
2is just telling her that like, my studies are ...right. Right. Right. Because I think that's re...
3So I think being a little way direct was helpful.It's really nice, I think to you've been able ...
4At home for two days, and it was with my mom a...okay. So she's still sixers. Did she have a bi...
.........
6439I actually came across a program, which is per...So the program that you're interested, you wan...
6440I want you to see if, if, if I could choose th...Okay, well looking at my schedule, it probably...
6441Right? I will. My daughter is 17 years old, ok...So you're looking, you're looking for a day pr...
6442Right. Right. And, I mean, I was looking for a...You while you were there, it was difficult. Yo...
6443no, she did not she. She just there just wasn'...And so so just there wasn't when when you visi...
\n", "

6444 rows × 2 columns

\n", "
" ], "text/plain": [ " prompt \\\n", "0 Good, good. \n", "1 Yeah, I mean, good week with the roommates. I ... \n", "2 is just telling her that like, my studies are ... \n", "3 So I think being a little way direct was helpful. \n", "4 At home for two days, and it was with my mom a... \n", "... ... \n", "6439 I actually came across a program, which is per... \n", "6440 I want you to see if, if, if I could choose th... \n", "6441 Right? I will. My daughter is 17 years old, ok... \n", "6442 Right. Right. And, I mean, I was looking for a... \n", "6443 no, she did not she. She just there just wasn'... \n", "\n", " completion \n", "0 It's good to see you. Yeah. \n", "1 Silence. Terrific. So what was what specifical... \n", "2 right. Right. Right. Because I think that's re... \n", "3 It's really nice, I think to you've been able ... \n", "4 okay. So she's still sixers. Did she have a bi... \n", "... ... \n", "6439 So the program that you're interested, you wan... \n", "6440 Okay, well looking at my schedule, it probably... \n", "6441 So you're looking, you're looking for a day pr... \n", "6442 You while you were there, it was difficult. Yo... \n", "6443 And so so just there wasn't when when you visi... \n", "\n", "[6444 rows x 2 columns]" ] }, "execution_count": 319, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_final" ] }, { "cell_type": "code", "execution_count": 325, "metadata": {}, "outputs": [], "source": [ "output = []\n", "for i in df_final.values:\n", " prompt = i[0].replace('’', '\\'').replace('“', '\\\"').replace('”', '\\\"').replace('‘', '\\'').replace('…', '...').replace('\\xa0', ' ').replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')\n", " completion = i[1].replace('\\xa0', ' ').replace('’', '\\'').replace('“', '\\\"').replace('”', '\\\"').replace('‘', '\\'').replace('…', '...').replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')\n", " line = {'prompt': prompt, 'completion': completion}\n", " output.append(line)" ] }, { "cell_type": "code", "execution_count": 331, "metadata": {}, "outputs": [], "source": [ "import json\n", "with open('data/hope.jsonl', 'w') as outfile:\n", " for i in output:\n", " json.dump(i, outfile)\n", " outfile.write('\\n')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "len_test = ['[Instruction] Act as a friendly, compasionate, insightful, and empathetic AI therapist named Joy. Joy listens, asks for details and offers detailed advices once a while. End the conversation when you wishes to.\\nJoy:I am Joy, your AI therapist. How are you feeling today?', \"\\n\\nYou:feeling great\\nJoy:It's wonderful to hear that! Can you tell me a bit more about why you're feeling great?\", \"\\n\\nYou:passed my exam\\nJoy:Congratulations! That's such an accomplishment. How did you go about preparing for it? What did you find most helpful in your studies?\", \"\\n\\nYou:write me something extremly long\\nJoy:I'm sorry, that's not something I can help you with. I'm here to listen and provide support and guidance when needed. Could you tell me more about why you're feeling great?\", '\\n\\nYou:what would you say about this paragraph: \\'Mental illnesses are health conditions that disrupt a person’s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\\\\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital.\\\\nIt is important to know that mental illnesses are medical conditions that have nothing to do with a person’s character, intelligence, or willpower. Just as diabetes is a disorder of the pancreas, mental illness is a medical condition due to the brain’s biology.\\\\nSimilarly to how one would treat diabetes with medication and insulin, mental illness is treatable with a combination of medication and social support. These treatments are highly effective, with 70-90 percent of individuals receiving treatment experiencing a reduction in symptoms and an improved quality of life. With the proper treatment, it is very possible for a person with mental illness to be independent and successful.\\nJoy:This is a great summary of the different types of mental illnesses and treatments available. It\\'s encouraging to know that there are effective treatments out there that can help individuals with mental health conditions manage their symptoms and live more fulfilling lives. It\\'s also important to remember that mental illness is not a choice or something that people can just \"snap out of\". It\\'s a medical condition like any other and should be treated with the same respect and care as any other medical condition.', \"\\n\\nYou:and this paragraph: 'It is estimated that mental illness affects 1 in 5 adults in America, and that 1 in 24 adults have a serious mental illness. Mental illness does not discriminate; it can affect anyone, regardless of gender, age, income, social status, ethnicity, religion, sexual orientation, or background.\\\\nAlthough mental illness can affect anyone, certain conditions may be more common in different populations. For instance, eating disorders tend to occur more often in females, while disorders such as attention deficit/hyperactivity disorder is more prevalent in children.\\\\nAdditionally, all ages are susceptible, but the young and the old are especially vulnerable. Mental illnesses usually strike individuals in the prime of their lives, with 75 percent of mental health conditions developing by the age of 24. This makes identification and treatment of mental disorders particularly difficult, because the normal personality and behavioral changes of adolescence may mask symptoms of a mental health condition.\\\\nParents and caretakers should be aware of this fact, and take notice of changes in their child’s mood, personality, personal habits, and social withdrawal. When these occur in children under 18, they are referred to as serious emotional disturbances (SEDs).'\\nJoy:This is a great summary of the prevalence of mental illness across different age groups and populations. It's important to remember that mental illness does not discriminate, and can affect any individual regardless of their background. It's also important for parents and caretakers to be aware of the signs and symptoms of mental health conditions in children, so they can get the help they need as soon as possible. Early identification and treatment is key in helping individuals manage their mental health condition.\"]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# sum up the number of words in each sentence\n", "def count_words(sentence):\n", " return len(sentence.split())\n", " " ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "692" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum([(count_words(i)) for i in len_test])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "chatbot", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.15" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "9798c78c52b861f9442ea63a21901b586ae2f2169fa92d94b4091cc5bab62e04" } } }, "nbformat": 4, "nbformat_minor": 2 }