File size: 9,519 Bytes
9be4956 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
import json
import os
import re
import sys
from tools.flights.apis import Flights
from tools.accommodations.apis import Accommodations
from tools.restaurants.apis import Restaurants
from tools.googleDistanceMatrix.apis import GoogleDistanceMatrix
from tools.googlePlaces.apis import GooglePlaces
from tools.attractions.apis import Attractions
from annotation.src.utils import get_valid_name_city,extract_before_parenthesis
from tqdm import tqdm
# Make the parent of the current working directory importable.
# NOTE(review): this runs after the project imports above, so it cannot help
# those imports resolve -- confirm whether it is still needed.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
# Switch the working directory to the folder that contains this script.
# NOTE(review): presumably the tool classes below load their data through
# relative paths -- confirm before moving or removing this call.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# Module-level tool instances; `flight`, `accommodations`, `restaurants`,
# `googleDistanceMatrix` and `attractions` are used by the __main__ script
# at the bottom of this file.
flight = Flights()
accommodations = Accommodations()
restaurants = Restaurants()
googleDistanceMatrix = GoogleDistanceMatrix()
# NOTE(review): `googlePlaces` is not referenced in the visible part of this file.
googlePlaces = GooglePlaces()
attractions = Attractions()
def load_line_json_data(filename):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        filename: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order. An
        empty or whitespace-only file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Stream the file line by line rather than reading it whole, and skip
        # blank lines: json.loads('') raises, which made an empty file or a
        # stray blank line fatal.
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data
def extract_numbers_from_filenames(directory):
    """Collect the numeric ids of the ``annotation_<N>.json`` files in a directory.

    Args:
        directory: Path of the directory to scan (its immediate entries only).

    Returns:
        list[int]: One integer per entry whose whole name is
        ``annotation_<digits>.json``, in the order ``os.listdir`` reports them.
    """
    # The dot is escaped and the whole name has to match, so look-alikes such
    # as 'annotation_3xjson' or 'annotation_3.json.bak' are not picked up.
    pattern = re.compile(r'annotation_(\d+)\.json')
    numbers = []
    for file in os.listdir(directory):
        # Match once per entry and reuse the match object for the capture.
        matched = pattern.fullmatch(file)
        if matched:
            numbers.append(int(matched.group(1)))
    return numbers
def extract_from_to(text: str):
    """
    Extracts 'A' and 'B' from the format "from A to B" in the given text, with B ending at a comma or the end of the string.

    Args:
    - text (str): The input string.

    Returns:
    - tuple: A tuple containing 'A' and 'B', without surrounding whitespace. If no match is found, returns (None, None).
    """
    # A and B must each start with a non-space character. B then runs up to
    # the first comma or the end of the text; [^,]* already stops there, so
    # no lookahead is required.
    pattern = r"from\s+(\S.*?)\s+to\s+([^,\s][^,]*)"
    matches = re.search(pattern, text)
    if matches is None:
        return (None, None)
    origin, destination = matches.groups()
    # The greedy tail can pick up blanks in front of the comma / end of text.
    return (origin, destination.rstrip())
def extract_city_list(query_data, annotated_data):
    """Return the distinct cities of a plan in order of first appearance.

    Only the first ``query_data['days']`` entries of ``annotated_data`` are
    inspected. An entry whose ``current_city`` reads "from A to B" contributes
    A and then B; any other entry contributes its ``current_city``. Every name
    is passed through ``extract_before_parenthesis`` before it is recorded.

    Args:
        query_data: Mapping that provides the number of days under ``'days'``.
        annotated_data: Sequence of per-day dicts with a ``'current_city'``
            string. Empty entries are skipped.

    Returns:
        list: City names without duplicates, in first-seen order.
    """
    city_list = []
    for unit in annotated_data[:query_data['days']]:
        # Skip empty placeholder days (the __main__ loop guards against them
        # with `unit != {}`) instead of failing with a KeyError.
        if not unit:
            continue
        current_city = unit['current_city']
        candidates = [current_city]
        if 'from' in current_city:
            from_city, to_city = extract_from_to(current_city)
            # The substring test also fires for text that merely contains
            # "from"; when it is not a real "from A to B" leg, keep treating
            # the whole string as one city rather than passing None along.
            if from_city is not None and to_city is not None:
                candidates = [from_city, to_city]
        for candidate in candidates:
            city = extract_before_parenthesis(candidate)
            if city not in city_list:
                city_list.append(city)
    return city_list
# if __name__ == '__main__':
# user_name = 'all'
# directory = '../data/annotation/{}'.format(user_name)
# query_data_list = load_line_json_data('../data/query/{}.jsonl'.format(user_name))
# numbers = extract_numbers_from_filenames(directory)
# print(numbers)
# for number in tqdm(numbers):
# json_data = json.load(open(os.path.join(directory, 'annotation_{}.json'.format(number))))
# query_data = query_data_list[number-1]
# city_list = extract_city_list(query_data,json_data)
# human_collected_info = []
# for city in city_list[1:]:
# attractions_data = attractions.run(city)
# if type(attractions_data) != str:
# attractions_data.drop(['Latitude','Longitude','Address','Phone','Website','City'],axis=1,inplace=True)
# if type(attractions_data) != str:
# attractions_data = attractions_data.to_string(index=False)
# restaurants_data = restaurants.run(city)
# restaurants_data.drop(['City'],axis=1,inplace=True)
# if type(restaurants_data) != str:
# restaurants_data = restaurants_data.to_string(index=False)
# accommodations_data = accommodations.run(city)
# accommodations_data.drop(['city'],axis=1,inplace=True)
# if type(accommodations_data) != str:
# accommodations_data = accommodations_data.to_string(index=False)
# human_collected_info.append({"Description":"Attractions in {}".format(city),"Content":attractions_data})
# human_collected_info.append({"Description":"Restaurants in {}".format(city),"Content":restaurants_data})
# human_collected_info.append({"Description":"Accommodations in {}".format(city),"Content":accommodations_data})
# for idx, unit in enumerate(json_data):
# if unit != {}:
# if 'from' in unit['current_city']:
# from_city, to_city = extract_from_to(unit['current_city'])
# from_city = extract_before_parenthesis(from_city)
# to_city = extract_before_parenthesis(to_city)
# date = query_data_list[number-1]['date'][idx]
# flight_data = flight.run(from_city, to_city, date)
# if type(flight_data) != str:
# flight_data.drop(['OriginCityName','DestCityName','Distance','FlightDate'],axis=1,inplace=True)
# flight_data = flight_data.to_string(index=False)
# human_collected_info.append({"Description":"Flight from {} to {} on {}".format(from_city, to_city, date), "Content":flight_data})
# self_driving_data = googleDistanceMatrix.run(from_city, to_city,mode="self-driving")
# human_collected_info.append({"Description":"Self-driving from {} to {}".format(from_city, to_city), "Content":self_driving_data})
# taxi_data = googleDistanceMatrix.run(from_city, to_city, mode='taxi')
# human_collected_info.append({"Description":"Taxi from {} to {}".format(from_city, to_city), "Content":taxi_data})
# # write to json file
# with open(os.path.join(directory, 'human_collected_info_{}.json'.format(number)), 'w', encoding='utf-8') as f:
# json.dump(human_collected_info, f, indent=4, ensure_ascii=False)
# # break
def _as_text(result):
    """Return ``result`` unchanged if it is a string, else ``result.to_string(index=False)``."""
    if isinstance(result, str):
        return result
    return result.to_string(index=False)


# Script entry point: for every query of the selected split, gather the
# attraction / restaurant / accommodation listings of the visited cities plus
# the flight, self-driving and taxi information of each "from A to B" day,
# and write them to plan/human_collected_info_<N>.json.
if __name__ == '__main__':
    # Index 2 selects the 'test' split.
    set_type = ['train','dev','test'][2]
    directory = '/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{}'.format(set_type)
    query_data_list = load_line_json_data('/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{}/query/query.jsonl'.format(set_type))
    # Plan files are numbered from 1, one per line of the query file.
    for number, query_data in enumerate(tqdm(query_data_list), start=1):
        # Use a context manager so the plan file handle is closed right away.
        with open(os.path.join(directory, 'plan/plan_{}.json'.format(number))) as plan_file:
            # Only element [1] of the plan file is used: the list of per-day entries.
            json_data = json.load(plan_file)[1]
        city_list = extract_city_list(query_data, json_data)
        human_collected_info = []
        # NOTE(review): the first city is skipped -- presumably the origin
        # city, for which no listings are wanted; confirm.
        for city in city_list[1:]:
            attractions_data = _as_text(attractions.run(city))
            restaurants_data = _as_text(restaurants.run(city))
            accommodations_data = _as_text(accommodations.run(city))
            human_collected_info.append({"Description":"Attractions in {}".format(city),"Content":attractions_data})
            human_collected_info.append({"Description":"Restaurants in {}".format(city),"Content":restaurants_data})
            human_collected_info.append({"Description":"Accommodations in {}".format(city),"Content":accommodations_data})
        for idx, unit in enumerate(json_data):
            # Only non-empty days that describe a "from A to B" move need transport data.
            if unit == {} or 'from' not in unit['current_city']:
                continue
            from_city, to_city = extract_from_to(unit['current_city'])
            from_city = extract_before_parenthesis(from_city)
            to_city = extract_before_parenthesis(to_city)
            date = query_data['date'][idx]
            flight_data = _as_text(flight.run(from_city, to_city, date))
            human_collected_info.append({"Description":"Flight from {} to {} on {}".format(from_city, to_city, date), "Content":flight_data})
            self_driving_data = googleDistanceMatrix.run(from_city, to_city,mode="self-driving")
            human_collected_info.append({"Description":"Self-driving from {} to {}".format(from_city, to_city), "Content":self_driving_data})
            taxi_data = googleDistanceMatrix.run(from_city, to_city, mode='taxi')
            human_collected_info.append({"Description":"Taxi from {} to {}".format(from_city, to_city), "Content":taxi_data})
        # write to json file
        with open(os.path.join(directory, 'plan/human_collected_info_{}.json'.format(number)), 'w', encoding='utf-8') as f:
            json.dump(human_collected_info, f, indent=4, ensure_ascii=False)
|