Thanks for the help!
# Weaviate schema with three classes. A property whose "dataType" names
# another class (e.g. ["Author"]) is the property the import code below uses
# for cross-references between objects.
schema = {
    "classes": [
        # --- Article: the summarised news article -------------------------
        {
            "class": "Article",
            "description": "An Article class to store the article summary and its authors",
            "properties": [
                {
                    "name": "title",
                    "dataType": ["string"],
                    "description": "The title of the article",
                },
                {
                    "name": "summary",
                    "dataType": ["text"],
                    "description": "The summary of the article",
                },
                {
                    "name": "wordCount",
                    "dataType": ["int"],
                    "description": "The number of words in the article's summary",
                },
                {
                    # reference: Article -> Author
                    "name": "hasAuthors",
                    "dataType": ["Author"],
                    "description": "The authors this article has",
                },
                {
                    # reference: Article -> Category
                    "name": "hasCategory",
                    "dataType": ["Category"],
                    "description": "The category of this article",
                },
            ],
        },
        # --- Author: a person who wrote one or more articles --------------
        {
            "class": "Author",
            "description": "An Author class to store the author's name and the articles who wrote",
            "properties": [
                {
                    "name": "name",
                    "dataType": ["string"],
                    "description": "The name of the author",
                },
                {
                    # reference: Author -> Article
                    "name": "wroteArticles",
                    "dataType": ["Article"],
                    "description": "The articles this author has",
                },
            ],
        },
        # --- Category: the topic an article is filed under ----------------
        {
            "class": "Category",
            "description": "A Category class to store the category that article belongs to",
            "properties": [
                {
                    "name": "name",
                    "dataType": ["string"],
                    "description": "the name of the category",
                },
            ],
        },
    ],
}
This is the schema, and I downloaded the news from cnn.com to use as my data:
import newspaper
import uuid
import json
from tqdm import tqdm
def _article_to_object(article):
    """Download and summarise one article; return its dict, or None if incomplete."""
    article.download()
    article.parse()
    article.nlp()
    # Only keep articles that have a title, a summary and at least one author.
    if not (article.title and article.summary and article.authors):
        return None
    # The UUID is derived from the article URL, so the same URL always
    # yields the same id.
    article_id = uuid.uuid3(uuid.NAMESPACE_DNS, article.url)
    return {
        'id': str(article_id),
        'title': article.title,
        'summary': article.summary,
        'authors': article.authors,
        'word_count': len(article.summary.split()),
    }


def get_articles_from_newspaper(
        news_url: str,
        max_articles: int = 100
) -> list:
    """
    Download newspaper articles and convert them to plain dictionaries.

    Articles are taken in the order the built source lists them. An article
    is skipped when downloading/parsing it raises, or when it lacks a title,
    a summary or authors.

    Parameters
    ----------
    news_url : str
        URL of the news site to collect articles from.
    max_articles : int, optional
        Maximum number of articles to return (default 100). Capped at the
        number of articles the source reports.

    Returns
    -------
    list
        One dict per kept article with the keys 'id', 'title', 'summary',
        'authors' and 'word_count'.
    """
    import logging  # local import so this block does not need a new file-level import

    logger = logging.getLogger(__name__)
    objects = []

    # Build the actual newspaper
    news_builder = newspaper.build(news_url, memoize_articles=False)
    available = news_builder.size()
    max_articles = min(max_articles, available)

    pbar = tqdm(total=max_articles)
    pbar.set_description(f"{news_url}")
    try:
        for index in range(available):
            if len(objects) >= max_articles:
                break
            article = news_builder.articles[index]
            try:
                article_object = _article_to_object(article)
            except Exception as exc:
                # Best effort: one broken article must not abort the whole
                # run, but Ctrl-C / SystemExit are no longer swallowed.
                logger.debug(
                    "Skipping article %s: %s",
                    getattr(article, 'url', '?'),
                    exc,
                )
                continue
            if article_object is None:
                continue
            objects.append(article_object)
            pbar.update(1)
    finally:
        # Close the progress bar even if an unexpected error propagates.
        pbar.close()
    return objects
# Collected article dictionaries; further sources can be appended the same way.
data = []
data.extend(get_articles_from_newspaper('http://cnn.com'))
And then I upload my data:
from weaviate.batch import Batch # for the typing purposes
from weaviate.util import generate_uuid5
def add_article(batch: Batch, article_data: dict) -> str:
    """
    Queue one article for import as an 'Article' object.

    Parameters
    ----------
    batch : Batch
        Batch the article object is added to.
    article_data : dict
        Article dictionary; the keys 'id', 'title', 'summary' and
        'word_count' are read.

    Returns
    -------
    str
        The article's id (``article_data['id']``), which is also used as the
        object's UUID.
    """
    article_object = {
        'title': article_data['title'],
        'wordCount': article_data['word_count'],
        # Replace newlines with a space rather than deleting them: deleting
        # would fuse the words on either side ("end.\nStart" -> "end.Start")
        # and the text would no longer match the whitespace-split word count.
        'summary': article_data['summary'].replace('\n', ' '),
    }
    article_id = article_data['id']

    # add article to the batch
    batch.add_data_object(
        data_object=article_object,
        class_name='Article',
        uuid=article_id,
    )
    return article_id
def add_author(batch: Batch, author_name: str) -> str:
    """
    Queue one author for import as an 'Author' object.

    Parameters
    ----------
    batch : Batch
        Batch the author object is added to.
    author_name : str
        Name of the author; stored in the 'name' property.

    Returns
    -------
    str
        The UUID generated for the author from ``author_name``.
    """
    author_object = {'name': author_name}

    # The UUID is generated from the name alone, so calling this again for
    # the same author name passes the same name to generate_uuid5.
    author_id = generate_uuid5(author_name)

    # add author to the batch
    batch.add_data_object(
        data_object=author_object,
        class_name='Author',
        uuid=author_id,
    )
    return author_id
def add_references(batch: Batch, article_id: str, author_id: str) -> None:
    """
    Queue the two cross-references that link an article and an author.

    Parameters
    ----------
    batch : Batch
        Batch the references are added to.
    article_id : str
        UUID of the 'Article' object.
    author_id : str
        UUID of the 'Author' object.
    """
    # Author -> Article
    batch.add_reference(
        from_object_uuid=author_id,
        from_object_class_name='Author',
        from_property_name='wroteArticles',
        to_object_uuid=article_id,
    )

    # Article -> Author
    batch.add_reference(
        from_object_uuid=article_id,
        from_object_class_name='Article',
        from_property_name='hasAuthors',
        to_object_uuid=author_id,
    )
# Import every downloaded article together with its authors.
# NOTE(review): `client` is not defined in the code shown here; it is assumed
# to be an already-connected Weaviate client created elsewhere.
client.batch.configure(batch_size=50, dynamic=True, callback=None)

with client.batch as batch:
    for article_data in data:
        # add article to the batch
        article_id = add_article(batch, article_data)

        for author in article_data['authors']:
            # add author to the batch
            author_id = add_author(batch, author)
            # add cross references (both directions) to the batch
            add_references(batch, article_id=article_id, author_id=author_id)