Hi Experts,
I am trying to use a model ID in my Form Recognizer Python script. Currently I am using only the endpoint URL and the API key, and I want to export the recognized data. How can I use the model ID? Here is my code:
from django.shortcuts import render
import os
from django.http import HttpResponse
import csv
import re
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import FormRecognizerClient
from azure.storage.blob import BlobClient
# Create your views here.
def download_blob(blob_name, output_path, conn_str=None, container_name='demo'):
    """Download a blob from Azure Blob Storage to a local file.

    The local file is named after the last path component of ``blob_name``
    and is created inside ``output_path``.

    :param blob_name: Name (path) of the blob inside the container.
    :param output_path: Local directory to write into; an empty string means
        the current working directory.
    :param conn_str: Storage account connection string. When omitted, it is
        read from the ``AZURE_STORAGE_CONNECTION_STRING`` environment
        variable, so the account key does not have to be kept in source code.
    :param container_name: Name of the blob container.
    :return: Path of the downloaded local file.
    :raises RuntimeError: If no connection string is passed or configured.
    """
    if conn_str is None:
        conn_str = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')
    if not conn_str:
        raise RuntimeError(
            'No storage connection string: pass conn_str or set the '
            'AZURE_STORAGE_CONNECTION_STRING environment variable.'
        )

    _, filename = os.path.split(blob_name)
    destination_file = os.path.join(output_path, filename)
    blob_client = BlobClient.from_connection_string(
        conn_str=conn_str,
        container_name=container_name,
        blob_name=blob_name,
    )
    with open(destination_file, "wb") as my_blob:
        blob_data = blob_client.download_blob()
        blob_data.readinto(my_blob)
    return destination_file
def recognize_form_tables(form_path):
    """Run Form Recognizer content recognition on a local file.

    The file is read into memory, deleted from disk, and sent to the service
    with ``begin_recognize_content``. Every table on every returned page is
    collected. In addition, the text lines of each page are scanned for table
    headers: a line starting with ``col1:`` starts a header, that line and the
    following nine lines are joined with spaces, and the joined text is
    matched against the header pattern.

    :param form_path: Path of the local document. The file is removed once
        its content has been read.
    :return: A ``(tables, table_label_data)`` tuple. ``tables`` is the list
        of table objects from all pages; ``table_label_data`` is a list of
        ``[col1, area_name, month_reporting]`` string lists, one per header
        that matched.
    """
    endpoint = "https://Test1.cognitiveservices.azure.com/"
    credential = AzureKeyCredential("<key>")
    form_recognizer_client = FormRecognizerClient(endpoint, credential)

    with open(form_path, "rb") as fd:
        form = fd.read()
    # Delete only after the handle is closed: removing a file that is still
    # open fails on Windows.
    os.remove(form_path)

    response = form_recognizer_client.begin_recognize_content(form)
    form_pages = response.result()

    tables = []
    table_label_data = []
    # Raw strings keep ``\s`` a regex escape rather than a string escape.
    port_regex = re.compile(r'^col1:(.*)')
    header_regex = re.compile(
        r'.*col1:(.*)Area Name:(.*)Month Reporting:\s*([A-Za-z]{3}-[0-9]{2}).*'
    )

    for content in form_pages:
        tables.extend(content.tables)

        table_header = ''
        i = 0  # lines seen since the last ``col1:`` line (or page start)
        flag = False  # True while header lines are being accumulated
        for line in content.lines:
            if port_regex.search(line.text):
                i = 0
                flag = True
            if flag and i < 10:
                table_header += line.text + ' '
            if i == 10:
                # Ten lines have been collected; try to parse them as a header
                # and start over with an empty buffer either way.
                header_match = header_regex.match(table_header)
                if header_match:
                    table_label_data.append(list(header_match.groups()))
                table_header = ''
                flag = False
            i += 1

    return tables, table_label_data
def create_csv(table, path):
    """Append the usable rows of ``table`` to the CSV file at ``path``.

    Rows with fewer than 10 cells, or whose fourth cell is empty, are
    skipped. The file is created if it does not exist yet.

    :param table: Iterable of rows, each row a sequence of cell values.
    :param path: Path of the CSV file to append to.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(
            row for row in table if len(row) >= 10 and row[3]
        )
def create_csv_data(tables, table_label_data):
    """Flatten recognized tables into rows and append them to ``table.csv``.

    Cells of each table are grouped into rows by their ``row_index``. For the
    first table, the header row is kept and prefixed with the column titles
    ``Port``, ``Area Name`` and ``Month Reporting``. For every later table the
    header cells are dropped, so that the combined CSV has a single header.
    Each data row is prefixed with the label values of its table when such
    values are available.

    :param tables: Table objects whose ``cells`` provide ``to_dict()`` with
        ``row_index``, ``text`` and optionally ``is_header`` entries.
    :param table_label_data: List of three-item label lists; entry ``n - 1``
        is used for the ``n``-th table.
    """
    for count, t in enumerate(tables, start=1):
        table_data = []
        row_index = -1  # index of the row currently being filled
        for cell in t.cells:
            cell = cell.to_dict()
            is_header = bool(cell.get('is_header'))

            if count > 1 and is_header:
                continue

            # For later tables the header row was skipped, so the cell's own
            # row index runs one ahead of the position in ``table_data``.
            same_row = cell['row_index'] == row_index or (
                count > 1 and cell['row_index'] == row_index + 1
            )
            if same_row:
                table_data[row_index].append(cell['text'])
                continue

            row_index += 1
            if is_header:
                table_data.append(['Port', 'Area Name', 'Month Reporting'])
            elif len(table_label_data) >= count:
                # ``>=`` (not ``>``): index ``count - 1`` is valid whenever
                # there are at least ``count`` label entries, so the last
                # table keeps its label columns too.
                table_data.append(list(table_label_data[count - 1]))
            else:
                table_data.append([])
            table_data[row_index].append(cell['text'])

        create_csv(table_data, 'table.csv')
    print('Created or updated table.csv file.')
def index(request):
    """Django view: download ``Test.pdf``, extract its tables, write ``Test.csv``.

    The blob is downloaded into the current directory, passed through form
    recognition, and the resulting rows are appended to ``table.csv``. The
    non-empty rows of ``table.csv`` are then copied into ``Test.csv``.

    :param request: The incoming HTTP request (not inspected).
    :return: An ``HttpResponse`` with the text ``Load Succeeded``.
    """
    local_pdf = download_blob('Test.pdf', '')
    tables, table_label_data = recognize_form_tables(local_pdf)
    print('form recognize success')
    create_csv_data(tables, table_label_data)

    with open('table.csv', newline='') as in_file, \
            open('Test.csv', 'w', newline='') as out_file:
        non_empty_rows = (row for row in csv.reader(in_file) if row)
        csv.writer(out_file).writerows(non_empty_rows)

    return HttpResponse("Load Succeeded")
