For some time I have been using Apache Solr for fast querying and faceting.
Once, I needed to take a dump of all the data in a Solr collection, so I implemented a function that works as a Python generator and yields every document from your Solr collection.
Below is the function.
def get_cursor_from_solr_using_cursor_mark(solr_query,timeout=5,num_rows=None,cursor_mark="*",num_rows_in_one_shot=10,unique_field_name="id"):
    """Generate documents from Solr by paging with ``cursorMark``.

    The query is re-issued page by page; each response's ``nextCursorMark``
    is fed back as the ``cursorMark`` of the following request.

    *** solr_query must not include the rows and start parameters. ***
    This function adds ``rows``, ``sort``, ``cursorMark`` and ``timeAllowed``
    itself, so the query should not carry those either.

    Args:
        solr_query: Full select URL including the query string, e.g.
            ``http://host:8983/solr/coll/select?q=*:*&wt=json``.
        timeout: Timeout passed to ``requests.get`` for every page.
        num_rows: How many rows are wanted in total; ``None`` means all rows.
        cursor_mark: Cursor mark to start from; ``"*"`` starts at the
            beginning.
        num_rows_in_one_shot: Page size, i.e. the largest ``rows`` value
            sent in a single request (a tuning factor).
        unique_field_name: Field used for the ascending sort; Solr cursors
            need the collection's unique key in the sort.

    Yields:
        When ``num_rows`` is None: ``(doc, next_cursor_mark)`` tuples, where
        ``next_cursor_mark`` is the mark returned with the page the document
        came from (it can be used to resume later).
        Otherwise: the bare ``doc`` dictionaries, at most ``num_rows`` of
        them.
    """
    # None means "no limit"; otherwise counts the rows still to be delivered.
    remaining = num_rows
    while remaining is None or remaining > 0:
        if remaining is None:
            rows = num_rows_in_one_shot
        else:
            # Never ask for more than is still needed on the last page.
            rows = min(num_rows_in_one_shot, remaining)

        # Passing the values through ``params`` lets requests URL-encode
        # them. Cursor marks are opaque strings that may contain characters
        # such as '+', '/' or '=', which must not be sent raw.
        params = {
            "rows": rows,
            "sort": str(unique_field_name) + " asc",
            "cursorMark": cursor_mark,
            "timeAllowed": -1,
        }
        response = requests.get(solr_query, params=params, timeout=timeout)
        solr_dict = response.json()
        docs = solr_dict['response']['docs']

        sent_mark = cursor_mark
        cursor_mark = solr_dict.get('nextCursorMark')

        for doc in docs:
            if num_rows is None:
                # Historical contract: the unlimited mode also reports the
                # cursor mark so that callers can resume.
                yield (doc, cursor_mark)
            else:
                yield doc

        if remaining is not None:
            remaining -= len(docs)

        # Stop on an empty page, a response without a next cursor, or a
        # cursor that did not advance (Solr's end-of-results signal).
        if not docs or cursor_mark is None or cursor_mark == sent_mark:
            break
Usage of function:
>>> source_solr_ip = "127.0.0.1"
>>> source_solr_port = 8983
>>> collection_name = "test"
>>> query = "http://" + source_solr_ip + ":" + str(source_solr_port) + "/" + "solr/" + collection_name + "/select?q=" + "*:*" + "&wt=json"
>>> solr_dump_gen = get_cursor_from_solr_using_cursor_mark(query)
>>> for each in solr_dump_gen:
...     print(each)