import ast
import gzip
import json
import os
import os.path as osp
import pickle
import zipfile
from collections import Counter
from typing import Union

import numpy as np
import pandas as pd
import torch
from huggingface_hub import hf_hub_download
from ogb.utils.url import download_url
from tqdm import tqdm

from stark_qa.skb.knowledge_base import SKB
from stark_qa.tools.download_hf import download_hf_file
from stark_qa.tools.io import load_files, save_files
from stark_qa.tools.node import Node, df_row_to_dict, register_node
from stark_qa.tools.process_text import clean_data, compact_text

DATASET = {
"repo": "snap-stanford/stark",
"processed": "skb/amazon/processed.zip",
"metadata": "skb/amazon/category_list.json"
}
RAW_DATA_HEADER = {
'review_header': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2',
'qa_header': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon/qa'
}


class AmazonSKB(SKB):
    """
    Amazon products semi-structured knowledge base (SKB), built from product
    metadata, reviews, and Q&A data. Optional meta links (brand, category,
    color) are added as additional node and edge types in post-processing.
    """
REVIEW_CATEGORIES = set([
'Amazon_Fashion', 'All_Beauty', 'Appliances', 'Arts_Crafts_and_Sewing',
'Automotive', 'Books', 'CDs_and_Vinyl', 'Cell_Phones_and_Accessories',
'Clothing_Shoes_and_Jewelry', 'Digital_Music', 'Electronics', 'Gift_Cards',
'Grocery_and_Gourmet_Food', 'Home_and_Kitchen', 'Industrial_and_Scientific',
'Kindle_Store', 'Luxury_Beauty', 'Magazine_Subscriptions', 'Movies_and_TV',
'Musical_Instruments', 'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies',
'Prime_Pantry', 'Software', 'Sports_and_Outdoors', 'Tools_and_Home_Improvement',
'Toys_and_Games', 'Video_Games'
])
QA_CATEGORIES = set([
'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive', 'Baby', 'Beauty',
'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry', 'Electronics',
'Grocery_and_Gourmet_Food', 'Health_and_Personal_Care', 'Home_and_Kitchen',
'Musical_Instruments', 'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies',
'Sports_and_Outdoors', 'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games'
])
COMMON = set([
'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive', 'Cell_Phones_and_Accessories',
'Clothing_Shoes_and_Jewelry', 'Electronics', 'Grocery_and_Gourmet_Food', 'Home_and_Kitchen',
'Musical_Instruments', 'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies',
'Sports_and_Outdoors', 'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games'
])
link_columns = ['also_buy', 'also_view']
review_columns = [
'reviewerID', 'summary', 'style', 'reviewText', 'vote', 'overall',
'verified', 'reviewTime'
]
qa_columns = [
'questionType', 'answerType', 'question', 'answer', 'answerTime'
]
meta_columns = [
'asin', 'title', 'global_category', 'category', 'price', 'brand',
'feature', 'rank', 'details', 'description'
]
candidate_types = ['product']
node_attr_dict = {
'product': ['title', 'dimensions', 'weight', 'description', 'features', 'reviews', 'Q&A'],
'brand': ['brand_name'],
'category': ['category_name'],
'color': ['color_name']
}

    def __init__(self,
root: Union[str, None] = None,
categories: list = ['Sports_and_Outdoors'],
meta_link_types: list = ['brand', 'category', 'color'],
max_entries: int = 25,
download_processed: bool = True,
**kwargs):
"""
Initialize the AmazonSKB class.
Args:
root (Union[str, None]): Root directory to store the dataset. If None, default HF cache paths will be used.
categories (list): Product categories.
meta_link_types (list): A list of entries in node info that are used to construct meta links.
max_entries (int): Maximum number of review & QA entries to show in the description.
download_processed (bool): Whether to download the processed data.
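
        Example:
            A minimal sketch; assumes the processed SKB can be fetched from
            the ``snap-stanford/stark`` dataset repo on the Hugging Face Hub::

                skb = AmazonSKB(root=None, download_processed=True)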
"""
self.root = root
self.max_entries = max_entries
if download_processed:
            if self.root is None or not osp.exists(
                osp.join(self.root, "category_list.json")
            ):
                sub_category_path = osp.join(self.root, "category_list.json") if self.root is not None else None
                self.sub_category_path = download_hf_file(
                    DATASET["repo"],
                    DATASET["metadata"],
                    repo_type="dataset",
                    save_as_file=sub_category_path,
                )
            else:
                self.sub_category_path = osp.join(self.root, "category_list.json")
            if self.root is None or (
                meta_link_types is not None
                and not osp.exists(
                    osp.join(
                        self.root,
                        "processed",
                        "cache",
                        "-".join(meta_link_types),
                        "node_info.pkl",
                    )
                )
            ):
processed_path = hf_hub_download(
DATASET["repo"], DATASET["processed"], repo_type="dataset"
)
if self.root is None:
self.root = osp.dirname(processed_path)
                if meta_link_types is None or not osp.exists(
                    osp.join(
                        self.root,
                        "processed",
                        "cache",
                        "-".join(meta_link_types),
                        "node_info.pkl",
                    )
                ):
                    print(f"Extracting downloaded processed data to {self.root}")
                    with zipfile.ZipFile(processed_path, "r") as zip_ref:
                        zip_ref.extractall(path=self.root)
self.raw_data_dir = osp.join(self.root, "raw")
        self.processed_data_dir = osp.join(self.root, "processed")
os.makedirs(self.raw_data_dir, exist_ok=True)
os.makedirs(self.processed_data_dir, exist_ok=True)
cache_path = None if meta_link_types is None else osp.join(self.processed_data_dir, 'cache', '-'.join(meta_link_types))
if cache_path is not None and osp.exists(cache_path):
print(f"Loading from {self.processed_data_dir}!")
print(f'Loading cached graph with meta link types {meta_link_types}')
processed_data = load_files(cache_path)
else:
print('Start processing raw data...')
print(f'{meta_link_types=}')
processed_data = self._process_raw(categories)
if meta_link_types:
processed_data = self.post_process(processed_data, meta_link_types=meta_link_types, cache_path=cache_path)
        super().__init__(**processed_data, **kwargs)

    def __getitem__(self, idx: int) -> Node:
"""
Get the node at the specified index.
Args:
idx (int): Index of the node.
Returns:
Node: The node at the specified index.
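
        Example:
            A minimal sketch; assumes ``skb`` is an initialized ``AmazonSKB``::

                node = skb[0]      # product node
                print(node.title)  # attributes are registered from node_info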
"""
idx = int(idx)
node_info = self.node_info[idx]
node = Node()
register_node(node, node_info)
return node

    def get_chunk_info(self, idx: int, attribute: str) -> str:
"""
Get chunk information for the specified attribute.
Args:
idx (int): Index of the node.
attribute (str): Attribute to get chunk information for.
Returns:
str: Chunk information.
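
        Example:
            A minimal sketch; assumes ``skb`` is an initialized ``AmazonSKB``::

                chunk = skb.get_chunk_info(0, 'review')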
"""
if not hasattr(self[idx], attribute):
return ''
node_attr = getattr(self[idx], attribute)
if 'feature' in attribute:
features = [feature for feature in node_attr if feature and 'asin' not in feature.lower()]
chunk = ' '.join(features)
        elif 'review' in attribute:
            chunk = ''
            if node_attr:
                scores = [
                    0 if pd.isnull(review['vote'])
                    else int(review['vote'].replace(",", ""))
                    for review in node_attr
                ]
                ranks = np.argsort(-np.array(scores))
                for i, review_idx in enumerate(ranks):
                    if i >= self.max_entries:
                        break
                    review = node_attr[review_idx]
                    chunk += f'The review "{review["summary"]}" states that "{review["reviewText"]}". '
        elif 'qa' in attribute:
            chunk = ''
            if node_attr:
                for i, question in enumerate(node_attr):
                    if i >= self.max_entries:
                        break
                    chunk += f'The question is "{question["question"]}", and the answer is "{question["answer"]}". '
elif 'description' in attribute and node_attr:
chunk = " ".join(node_attr)
else:
chunk = node_attr
return chunk

    def get_doc_info(self, idx: int,
                     add_rel: bool = True,
                     compact: bool = False) -> str:
"""
Get document information for the specified node.
Args:
idx (int): Index of the node.
add_rel (bool): Whether to add relationship information.
compact (bool): Whether to compact the text.
Returns:
str: Document information.
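
        Example:
            A minimal sketch; assumes ``skb`` is an initialized ``AmazonSKB``::

                doc = skb.get_doc_info(0, add_rel=True, compact=True)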
"""
        node_type = self.node_type_dict[int(self.node_types[idx])]
        if node_type == 'brand':
            return f'brand name: {self[idx].brand_name}'
        if node_type == 'category':
            return f'category name: {self[idx].category_name}'
        if node_type == 'color':
            return f'color name: {self[idx].color_name}'
node = self[idx]
doc = f'- product: {node.title}\n'
if hasattr(node, 'brand'):
doc += f'- brand: {node.brand}\n'
        try:
            dimensions, weight = node.details.dictionary.product_dimensions.split(' ; ')
            doc += f'- dimensions: {dimensions}\n- weight: {weight}\n'
        except (AttributeError, ValueError):
            # not every product has parsable dimension/weight details
            pass
if node.description:
description = " ".join(node.description).strip(" ")
if description:
doc += f'- description: {description}\n'
feature_text = '- features: \n'
if node.feature:
for feature_idx, feature in enumerate(node.feature):
if feature and 'asin' not in feature.lower():
feature_text += f'#{feature_idx + 1}: {feature}\n'
else:
feature_text = ''
        if node.review:
            review_text = '- reviews: \n'
            scores = [
                0 if pd.isnull(review['vote'])
                else int(review['vote'].replace(",", ""))
                for review in node.review
            ]
            ranks = np.argsort(-np.array(scores))
            for i, review_idx in enumerate(ranks):
                if i >= self.max_entries:
                    break
                review = node.review[review_idx]
                review_text += f'#{review_idx + 1}:\nsummary: {review["summary"]}\ntext: "{review["reviewText"]}"\n'
else:
review_text = ''
        if node.qa:
            qa_text = '- Q&A: \n'
            for qa_idx, qa in enumerate(node.qa):
                if qa_idx >= self.max_entries:
                    break
                qa_text += f'#{qa_idx + 1}:\nquestion: "{qa["question"]}"\nanswer: "{qa["answer"]}"\n'
else:
qa_text = ''
doc += feature_text + review_text + qa_text
if add_rel:
doc += self.get_rel_info(idx)
if compact:
doc = compact_text(doc)
return doc

    def get_rel_info(self,
                     idx: int,
                     rel_types: Union[list, None] = None,
                     n_rel: int = -1) -> str:
"""
Get relation information for the specified node.
Args:
idx (int): Index of the node.
rel_types (Union[list, None]): List of relation types or None if all relation types are included.
n_rel (int): Number of relations. Default is -1 if all relations are included.
Returns:
doc (str): Relation information.
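
        Example:
            A minimal sketch; assumes ``skb`` is an initialized ``AmazonSKB``::

                rel = skb.get_rel_info(0, rel_types=['also_buy'], n_rel=5)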
"""
        doc = ''
        rel_types = self.rel_type_lst() if rel_types is None else rel_types
        # only collect neighbors for the requested relation types
        n_also_buy = self.get_neighbor_nodes(idx, 'also_buy') if 'also_buy' in rel_types else []
        n_also_view = self.get_neighbor_nodes(idx, 'also_view') if 'also_view' in rel_types else []
        n_has_brand = self.get_neighbor_nodes(idx, 'has_brand') if 'has_brand' in rel_types else []
        str_also_buy = [f"#{j + 1}: " + self[i].title + '\n' for j, i in enumerate(n_also_buy)]
        str_also_view = [f"#{j + 1}: " + self[i].title + '\n' for j, i in enumerate(n_also_view)]
        if n_rel > 0:
            str_also_buy = str_also_buy[:n_rel]
            str_also_view = str_also_view[:n_rel]
        str_also_buy = ''.join(str_also_buy)
        str_also_view = ''.join(str_also_view)
        str_has_brand = ''
        if n_has_brand:
            str_has_brand = f' brand: {self[n_has_brand[0]].brand_name}\n'
        if str_also_buy:
            doc += f' products also purchased: \n{str_also_buy}'
        if str_also_view:
            doc += f' products also viewed: \n{str_also_view}'
        if str_has_brand:
            doc += str_has_brand
if doc:
doc = '- relations:\n' + doc
return doc

    def _process_raw(self, categories: list) -> dict:
"""
Process raw data to construct the knowledge base.
Args:
categories (list): List of categories to process.
Returns:
dict: Processed data.
"""
if 'all' in categories:
review_categories = self.REVIEW_CATEGORIES
qa_categories = self.QA_CATEGORIES
else:
qa_categories = review_categories = categories
assert not set(categories) - self.COMMON, 'invalid categories exist'
if osp.exists(osp.join(self.processed_data_dir, 'node_info.pkl')):
print(f'Load processed data from {self.processed_data_dir}')
loaded_files = load_files(self.processed_data_dir)
loaded_files.update({
'node_types': torch.zeros(len(loaded_files['node_info'])),
'node_type_dict': {0: 'product'}
})
return loaded_files
print('Check data downloading...')
for category in review_categories:
review_header = RAW_DATA_HEADER['review_header']
if not os.path.exists(osp.join(self.raw_data_dir, f'{category}.json.gz')):
print(f'Downloading {category} data...')
download_url(f'{review_header}/categoryFiles/{category}.json.gz', self.raw_data_dir)
download_url(f'{review_header}/metaFiles2/meta_{category}.json.gz', self.raw_data_dir)
for category in qa_categories:
qa_header = RAW_DATA_HEADER['qa_header']
if not os.path.exists(osp.join(self.raw_data_dir, f'qa_{category}.json.gz')):
print(f'Downloading {category} QA data...')
download_url(f'{qa_header}/qa_{category}.json.gz', self.raw_data_dir)
if not osp.exists(osp.join(self.processed_data_dir, 'node_info.pkl')):
ckt_path = osp.join(self.root, 'intermediate')
os.makedirs(ckt_path, exist_ok=True)
print('Loading data... It might take a while')
df_qa_path = os.path.join(ckt_path, 'df_qa.pkl')
if os.path.exists(df_qa_path):
df_qa = pd.read_pickle(df_qa_path)
else:
df_qa = pd.concat([
read_qa(osp.join(self.raw_data_dir, f'qa_{category}.json.gz'))
for category in qa_categories
])[['asin'] + self.qa_columns]
df_qa.to_pickle(df_qa_path)
print('df_qa loaded')
df_review_path = os.path.join(ckt_path, 'df_review.pkl')
if os.path.exists(df_review_path):
df_review = pd.read_pickle(df_review_path)
else:
df_review = pd.concat([
read_review(osp.join(self.raw_data_dir, f'{category}.json.gz'))
for category in review_categories
])[['asin'] + self.review_columns]
df_review.to_pickle(df_review_path)
print('df_review loaded')
df_ucsd_meta_path = os.path.join(ckt_path, 'df_ucsd_meta.pkl')
if os.path.exists(df_ucsd_meta_path):
df_ucsd_meta = pd.read_pickle(df_ucsd_meta_path)
else:
                meta_df_lst = []
                for category in review_categories:
                    # the metadata files share the same gzipped JSON-lines format as reviews
                    cat_meta = read_review(osp.join(self.raw_data_dir, f'meta_{category}.json.gz'))
                    cat_meta.insert(0, 'global_category', category.replace('_', ' '))
                    meta_df_lst.append(cat_meta)
                df_ucsd_meta = pd.concat(meta_df_lst)
df_ucsd_meta.to_pickle(df_ucsd_meta_path)
print('df_ucsd_meta loaded')
print('Preprocessing data...')
df_ucsd_meta = df_ucsd_meta.drop_duplicates(subset='asin', keep='first')
df_meta = df_ucsd_meta[self.meta_columns + self.link_columns]
            df_review_meta = df_review.merge(df_meta, on='asin')
unique_asin = np.unique(np.array(df_review_meta['asin']))
df_qa_reduced = df_qa[df_qa['asin'].isin(unique_asin)]
df_review_reduced = df_review[df_review['asin'].isin(unique_asin)]
df_meta_reduced = df_meta[df_meta['asin'].isin(unique_asin)].reset_index()
            def get_map(df: pd.DataFrame) -> tuple:
                asin2id, id2asin = {}, {}
                for idx, asin in enumerate(df['asin']):
                    asin2id[asin] = idx
                    id2asin[idx] = asin
                return asin2id, id2asin
print('Construct node info and graph...')
self.asin2id, self.id2asin = get_map(df_meta_reduced)
node_info = self.construct_raw_node_info(df_meta_reduced, df_review_reduced, df_qa_reduced)
edge_index, edge_types = self.create_raw_product_graph(df_meta_reduced, columns=self.link_columns)
edge_type_dict = {0: 'also_buy', 1: 'also_view'}
processed_data = {
'node_info': node_info,
'edge_index': edge_index,
'edge_types': edge_types,
'edge_type_dict': edge_type_dict
}
print(f'Saving to {self.processed_data_dir}...')
save_files(save_path=self.processed_data_dir, **processed_data)
processed_data.update({
'node_types': torch.zeros(len(processed_data['node_info'])),
'node_type_dict': {0: 'product'}
})
return processed_data

    def post_process(self, raw_info: dict, meta_link_types: list, cache_path: str = None) -> dict:
"""
Post-process the raw information to add meta link types.
Args:
raw_info (dict): Raw information.
meta_link_types (list): List of meta link types to add.
cache_path (str): Path to cache the processed data.
Returns:
dict: Post-processed data.
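
        Example:
            A minimal sketch; assumes ``raw_info`` was produced by ``_process_raw``::

                data = skb.post_process(raw_info, meta_link_types=['brand'], cache_path=None)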
"""
print(f'Adding meta link types {meta_link_types}')
node_info = raw_info['node_info']
edge_type_dict = raw_info['edge_type_dict']
node_type_dict = raw_info['node_type_dict']
node_types = raw_info['node_types'].tolist()
edge_index = raw_info['edge_index'].tolist()
edge_types = raw_info['edge_types'].tolist()
n_e_types, n_n_types = len(edge_type_dict), len(node_type_dict)
for i, link_type in enumerate(meta_link_types):
            if link_type == 'brand':
                values = np.array([node_info_i[link_type] for node_info_i in node_info.values()
                                   if link_type in node_info_i])
                indices = np.array([idx for idx, node_info_i in enumerate(node_info.values())
                                    if link_type in node_info_i])
            elif link_type in ['category', 'color']:
                value_list, index_list = [], []
                for idx, node_info_i in enumerate(node_info.values()):
                    if link_type in node_info_i:
                        value_list.extend(node_info_i[link_type])
                        index_list.extend([idx] * len(node_info_i[link_type]))
                values, indices = np.array(value_list), np.array(index_list)
else:
raise Exception(f'Invalid meta link type {link_type}')
cur_n_nodes = len(node_info)
node_type_dict[n_n_types + i] = link_type
edge_type_dict[n_e_types + i] = "has_" + link_type
unique = np.unique(values)
            for j, unique_j in tqdm(enumerate(unique), total=len(unique)):
                node_info[cur_n_nodes + j] = {link_type + '_name': unique_j}
                ids = indices[values == unique_j]
                edge_index[0].extend(ids.tolist())
                edge_index[1].extend([cur_n_nodes + j] * len(ids))
                edge_types.extend([i + n_e_types] * len(ids))
node_types.extend([n_n_types + i for _ in range(len(unique))])
print(f'finished adding {link_type}')
edge_index = torch.LongTensor(edge_index)
edge_types = torch.LongTensor(edge_types)
node_types = torch.LongTensor(node_types)
files = {
'node_info': node_info,
'edge_index': edge_index,
'edge_types': edge_types,
'edge_type_dict': edge_type_dict,
'node_type_dict': node_type_dict,
'node_types': node_types
}
if cache_path is not None:
save_files(cache_path, **files)
return files

    def _process_brand(self, brand: str) -> str:
"""
Process brand names to remove unnecessary characters.
Args:
brand (str): Brand name.
Returns:
str: Processed brand name.
"""
        brand = brand.strip(" \".*+,-_!@#$%^&*();/|<>'\t\n\r\\")
if brand.startswith('by '):
brand = brand[3:]
if brand.endswith('.com'):
brand = brand[:-4]
if brand.startswith('www.'):
brand = brand[4:]
if len(brand) > 100:
brand = brand.split(' ')[0]
return brand

    def construct_raw_node_info(self, df_meta: pd.DataFrame, df_review: pd.DataFrame, df_qa: pd.DataFrame) -> dict:
"""
Construct raw node information.
Args:
df_meta (pd.DataFrame): DataFrame containing meta information.
df_review (pd.DataFrame): DataFrame containing review information.
df_qa (pd.DataFrame): DataFrame containing QA information.
Returns:
dict: Dictionary containing node information.
"""
node_info = {idx: {'review': [], 'qa': []} for idx in range(len(df_meta))}
###################### Assign color ########################
        def assign_colors(df_review: pd.DataFrame, lower_limit: int = 20) -> dict:
            # assign products to colors based on review style metadata
            df_review = df_review[['asin', 'style']]
            df_review = df_review.dropna(subset=['style'])
            raw_color_dict = {}
            for idx, row in tqdm(df_review.iterrows(), total=len(df_review)):
                asin, style = row['asin'], row['style']
                for key in style.keys():
                    if 'color' in key.lower():
                        color = style[key] if isinstance(style[key], str) else style[key][0]
                        raw_color_dict.setdefault(asin, []).append(color.strip().lower())
all_color_values = []
for asin in raw_color_dict.keys():
raw_color_dict[asin] = list(set(raw_color_dict[asin]))
all_color_values.extend(raw_color_dict[asin])
print('number of all colors', len(all_color_values))
color_counter = Counter(all_color_values)
print('number of unique colors', len(color_counter))
color_counter = {k: v for k, v in sorted(color_counter.items(), key=lambda item: item[1], reverse=True)}
            selected_colors = []
            for color, number in color_counter.items():
                if number > lower_limit and len(color) > 2 and len(color.split(' ')) < 5 and not color.isnumeric():
                    selected_colors.append(color)
print('number of selected colors', len(selected_colors))
filtered_color_dict = {}
total_color_connections = 0
for asin in raw_color_dict.keys():
filtered_color_dict[asin] = []
for value in raw_color_dict[asin]:
if value in selected_colors:
filtered_color_dict[asin].append(value)
total_color_connections += len(filtered_color_dict[asin])
print('number of linked products', len(filtered_color_dict))
print('number of total connections', total_color_connections)
return filtered_color_dict
filtered_color_dict_path = os.path.join(self.root, 'intermediate', 'filtered_color_dict.pkl')
if os.path.exists(filtered_color_dict_path):
with open(filtered_color_dict_path, 'rb') as f:
filtered_color_dict = pickle.load(f)
else:
filtered_color_dict = assign_colors(df_review)
with open(filtered_color_dict_path, 'wb') as f:
pickle.dump(filtered_color_dict, f)
for df_meta_i in tqdm(df_meta.itertuples()):
asin = df_meta_i.asin
idx = self.asin2id[asin]
if asin in filtered_color_dict and filtered_color_dict[asin]:
node_info[idx]['color'] = filtered_color_dict[asin]
###################### Assign brand and category ########################
        with open(self.sub_category_path, 'r') as f:
            sub_categories = set(json.load(f))
for df_meta_i in tqdm(df_meta.itertuples()):
asin = df_meta_i.asin
idx = self.asin2id[asin]
for column in self.meta_columns:
if column == 'brand':
brand = self._process_brand(clean_data(getattr(df_meta_i, column)))
if brand:
node_info[idx]['brand'] = brand
elif column == 'category':
category_list = [
category.lower() for category in getattr(df_meta_i, column)
if category.lower() in sub_categories
]
if category_list:
node_info[idx]['category'] = category_list
else:
node_info[idx][column] = clean_data(getattr(df_meta_i, column))
###################### Process review and QA ########################
        for name, df, column_names in zip(['review', 'qa'],
                                          [df_review, df_qa],
                                          [self.review_columns, self.qa_columns]):
            for i in tqdm(range(len(df))):
                df_i = df.iloc[i]
                asin = df_i['asin']
                idx = self.asin2id[asin]
                node_info[idx][name].append(df_row_to_dict(df_i, column_names))
return node_info

    def create_raw_product_graph(self, df: pd.DataFrame, columns: list) -> tuple:
"""
Create raw product graph.
Args:
df (pd.DataFrame): DataFrame containing meta information.
columns (list): List of columns to create edges.
Returns:
tuple: Tuple containing edge index and edge types.
"""
edge_types = []
edge_index = [[], []]
for df_i in df.itertuples():
out_node = self.asin2id[df_i.asin]
for edge_type_id, edge_type in enumerate(columns):
if isinstance(getattr(df_i, edge_type), list):
in_nodes = [self.asin2id[i] for i in getattr(df_i, edge_type) if i in self.asin2id]
edge_types.extend([edge_type_id] * len(in_nodes))
edge_index[0].extend([out_node] * len(in_nodes))
edge_index[1].extend(in_nodes)
return torch.LongTensor(edge_index), torch.LongTensor(edge_types)

    def has_brand(self, idx: int, brand: str) -> bool:
"""
Check if the node has the specified brand.
Args:
idx (int): Index of the node.
brand (str): Brand name.
Returns:
bool: Whether the node has the specified brand.
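
        Example:
            A minimal sketch; the brand string is a hypothetical value::

                matches = skb.has_brand(0, 'Columbia')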
"""
        try:
            b = self[idx].brand
            if b.endswith('.com'):
                b = b[:-4]
            if brand.endswith('.com'):
                brand = brand[:-4]
            return b.lower().strip("\"") == brand.lower().strip("\"")
        except AttributeError:
            # the node has no brand attribute
            return False

    def has_also_buy(self, idx: int, also_buy_item: int) -> bool:
"""
Check if the node has the specified also_buy item.
Args:
idx (int): Index of the node.
also_buy_item (int): Item to check.
Returns:
bool: Whether the node has the specified also_buy item.
"""
        try:
            also_buy_lst = self.get_neighbor_nodes(idx, 'also_buy')
            return also_buy_item in also_buy_lst
        except Exception:
            return False

    def has_also_view(self, idx: int, also_view_item: int) -> bool:
"""
Check if the node has the specified also_view item.
Args:
idx (int): Index of the node.
also_view_item (int): Item to check.
Returns:
bool: Whether the node has the specified also_view item.
"""
        try:
            also_view_lst = self.get_neighbor_nodes(idx, 'also_view')
            return also_view_item in also_view_lst
        except Exception:
            return False


def read_review(path: str) -> pd.DataFrame:
"""
Read and parse review files.
Args:
path (str): Path to the review file.
Returns:
pd.DataFrame: DataFrame containing the reviews.
"""
    def parse(path: str):
        with gzip.open(path, 'rb') as g:
            for line in g:
                yield json.loads(line)

def getDF(path: str) -> pd.DataFrame:
df = {}
for i, d in enumerate(parse(path)):
df[i] = d
return pd.DataFrame.from_dict(df, orient='index')
return getDF(path)


def read_qa(path: str) -> pd.DataFrame:
"""
Read and parse QA files.
Args:
path (str): Path to the QA file.
Returns:
pd.DataFrame: DataFrame containing the QA data.
"""
    def parse(path: str):
        with gzip.open(path, 'rb') as g:
            for line in g:
                # the QA files contain Python dict literals, one per line;
                # ast.literal_eval parses them without executing arbitrary code
                yield ast.literal_eval(line.decode('utf-8'))

def getDF(path: str) -> pd.DataFrame:
df = {}
for i, d in enumerate(parse(path)):
df[i] = d
return pd.DataFrame.from_dict(df, orient='index')
return getDF(path)
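

if __name__ == '__main__':
    # Minimal usage sketch; assumes network access to the Hugging Face Hub
    # and the default 'snap-stanford/stark' dataset repo.
    skb = AmazonSKB(root=None, download_processed=True)
    # node index 0 is an arbitrary product chosen for illustration
    print(skb.get_doc_info(0, add_rel=True, compact=True))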